//created by Victoria Zhislina, Senior Application Engineer, Intel Corporation, victoria.zhislina@intel.com

//*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved.

//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.

//By downloading, copying, installing or using the software you agree to this license.
//If you do not agree to this license, do not download, install, copy or use the software.

//Permission to use, copy, modify, and/or distribute this software for any
//purpose with or without fee is hereby granted, provided that the above
//copyright notice and this permission notice appear in all copies.

//THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
//REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
//AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
//INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
//LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
//OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
//PERFORMANCE OF THIS SOFTWARE.

//*****************************************************************************************
// This file is intended to simplify ARM->IA32 porting
// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
// and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files below
// The MMX instruction set is not used due to performance overhead and the necessity to use the
// EMMS instruction (_mm_empty()) for MMX-x87 floating point switching
//*****************************************************************************************

//!!!!!!! To use this file in your project that uses ARM NEON intrinsics just keep arm_neon.h included and compile it as usual.
//!!!!!!! Please pay attention to USE_SSE4 below - you need to define it for the newest Intel platforms for
//!!!!!!! greater performance. It can be done by the -msse4.2 compiler switch.
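//For illustration only (added porting example, not part of the original header): a NEON source
//file ported this way keeps its intrinsic calls untouched and is simply rebuilt for x86 with
//SSE4.2 enabled, e.g.
//    #include "arm_neon.h"   //resolved to this SSE-based implementation
//    void add8(int8x8_t* r, int8x8_t a, int8x8_t b) { *r = vadd_s8(a, b); }
//built with something like:  gcc -O2 -msse4.2 -c add8.c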
#if defined(__SSE4_2__)
#define USE_SSE4
#endif

#include <xmmintrin.h> //SSE
#include <emmintrin.h> //SSE2
#include <pmmintrin.h> //SSE3
#include <tmmintrin.h> //SSSE3
#ifdef USE_SSE4
#include <smmintrin.h> //SSE4.1
#include <nmmintrin.h> //SSE4.2
#endif
//*************** functions and data attributes, compiler dependent *********************************
//***********************************************************************************
#ifdef __GNUC__
#define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
#define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#if _GCC_VERSION < 40500
#define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
#else
#define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
#endif
#if defined(__x86_64__)
#define _NEON2SSE_64BIT __x86_64__
#endif
#else
#define _NEON2SSE_ALIGN_16 __declspec(align(16))
#define _NEON2SSE_INLINE __inline
#if defined(_MSC_VER) || defined (__INTEL_COMPILER)
#define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
#if defined(_M_X64)
#define _NEON2SSE_64BIT _M_X64
#endif
#else
#define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
#endif
#endif

#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
#define _NEON2SSE_64BIT_SSE4
#endif
/*********************************************************************************************************************/
// data types conversion
/*********************************************************************************************************************/
#if defined(_MSC_VER) && (_MSC_VER < 1300)
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef signed short int16_t;
typedef unsigned short uint16_t;
typedef signed int int32_t;
typedef unsigned int uint32_t;
typedef signed long long int64_t;
typedef unsigned long long uint64_t;
#elif defined(_MSC_VER)
typedef signed __int8 int8_t;
typedef unsigned __int8 uint8_t;
typedef signed __int16 int16_t;
typedef unsigned __int16 uint16_t;
typedef signed __int32 int32_t;
typedef unsigned __int32 uint32_t;
typedef signed long long int64_t;
typedef unsigned long long uint64_t;
#else
#include <stdint.h>
#include <limits.h>
#endif
typedef union __m64_128 {
    uint64_t m64_u64[1];
    float m64_f32[2];
    int8_t m64_i8[8];
    int16_t m64_i16[4];
    int32_t m64_i32[2];
    int64_t m64_i64[1];
    uint8_t m64_u8[8];
    uint16_t m64_u16[4];
    uint32_t m64_u32[2];
} __m64_128;

typedef __m64_128 int8x8_t;
typedef __m64_128 uint8x8_t;
typedef __m64_128 int16x4_t;
typedef __m64_128 uint16x4_t;
typedef __m64_128 int32x2_t;
typedef __m64_128 uint32x2_t;
typedef __m64_128 int64x1_t;
typedef __m64_128 uint64x1_t;
typedef __m64_128 poly8x8_t;
typedef __m64_128 poly16x4_t;

typedef __m64_128 float32x2_t;
typedef __m128 float32x4_t;

typedef __m128 float16x4_t; //not supported by IA, for compatibility
typedef __m128 float16x8_t; //not supported by IA, for compatibility

typedef __m128i int8x16_t;
typedef __m128i int16x8_t;
typedef __m128i int32x4_t;
typedef __m128i int64x2_t;
typedef __m128i uint8x16_t;
typedef __m128i uint16x8_t;
typedef __m128i uint32x4_t;
typedef __m128i uint64x2_t;
typedef __m128i poly8x16_t;
typedef __m128i poly16x8_t;
#if defined(_MSC_VER)
#define SINT_MIN (-2147483647 - 1) /* min signed int value */
#define SINT_MAX 2147483647 /* max signed int value */
#else
#define SINT_MIN INT_MIN /* min signed int value */
#define SINT_MAX INT_MAX /* max signed int value */
#endif

typedef float float32_t;
typedef float __fp16;

typedef uint8_t poly8_t;
typedef uint16_t poly16_t;
//MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128x type as function arguments, resulting in
//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need a special trick for functions that use these types.

//Unfortunately we are unable to merge two 64-bit registers into one 128-bit register because the user should be able to access the val[n] members explicitly!!!
typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy

typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above */
typedef struct int8x16x2_t uint8x16x2_t;
typedef struct int16x8x2_t uint16x8x2_t;
typedef struct int32x4x2_t uint32x4x2_t;
typedef struct int64x2x2_t uint64x2x2_t;
typedef struct int8x16x2_t poly8x16x2_t;
typedef struct int16x8x2_t poly16x8x2_t;

typedef struct int8x8x2_t uint8x8x2_t;
typedef struct int16x4x2_t uint16x4x2_t;
typedef struct int32x2x2_t uint32x2x2_t;
typedef struct int64x1x2_t uint64x1x2_t;
typedef struct int8x8x2_t poly8x8x2_t;
typedef struct int16x4x2_t poly16x4x2_t;
struct float32x4x2_t {
    float32x4_t val[2];
};
struct float16x8x2_t {
    float16x8_t val[2];
};
struct float32x2x2_t {
    float32x2_t val[2];
};

typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
typedef float16x8x2_t float16x4x2_t;
typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy

typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x8x4_t uint8x8x4_t;
typedef struct int16x4x4_t uint16x4x4_t;
typedef struct int32x2x4_t uint32x2x4_t;
typedef struct int64x1x4_t uint64x1x4_t;
typedef struct int8x8x4_t poly8x8x4_t;
typedef struct int16x4x4_t poly16x4x4_t;

typedef struct int8x16x4_t uint8x16x4_t;
typedef struct int16x8x4_t uint16x8x4_t;
typedef struct int32x4x4_t uint32x4x4_t;
typedef struct int64x2x4_t uint64x2x4_t;
typedef struct int8x16x4_t poly8x16x4_t;
typedef struct int16x8x4_t poly16x8x4_t;
struct float32x4x4_t {
    float32x4_t val[4];
};
struct float16x8x4_t {
    float16x8_t val[4];
};
struct float32x2x4_t {
    float32x2_t val[4];
};

typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
typedef float16x8x4_t float16x4x4_t;
typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy

typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x16x3_t uint8x16x3_t;
typedef struct int16x8x3_t uint16x8x3_t;
typedef struct int32x4x3_t uint32x4x3_t;
typedef struct int64x2x3_t uint64x2x3_t;
typedef struct int8x16x3_t poly8x16x3_t;
typedef struct int16x8x3_t poly16x8x3_t;
typedef struct int8x8x3_t uint8x8x3_t;
typedef struct int16x4x3_t uint16x4x3_t;
typedef struct int32x2x3_t uint32x2x3_t;
typedef struct int64x1x3_t uint64x1x3_t;
typedef struct int8x8x3_t poly8x8x3_t;
typedef struct int16x4x3_t poly16x4x3_t;
struct float32x4x3_t {
    float32x4_t val[3];
};
struct float32x2x3_t {
    float32x2_t val[3];
};
struct float16x8x3_t {
    float16x8_t val[3];
};

typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
typedef float16x8x3_t float16x4x3_t;
//****************************************************************************
//****** Porting auxiliary macros ********************************************

//** floating point related macros **
#define _M128i(a) _mm_castps_si128(a)
#define _M128(a) _mm_castsi128_ps(a)
//here the most performance-effective implementation depends on the compiler and on the 32/64-bit build
#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500) )
#define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
#define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
#define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
#else
//for 32-bit gcc and Microsoft compiler builds
#define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
#define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
#define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
#endif
#define _pM128(a) _mm_castsi128_ps(_pM128i(a))

#define return64(a) _M64(res64,a); return res64;
#define return64f(a) _M64f(res64,a); return res64;

#define _Ui64(a) (*(uint64_t*)&(a))
#define _UNSIGNED_T(a) u ## a

#define _SIGNBIT64 ((uint64_t)1 << 63)
#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )

#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define __constrange(min,max) const
#define __transfersize(size)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
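//A minimal usage sketch (added illustration only, not part of the original header): it shows how
//the 64-bit "d register" emulation pieces above combine - operands held in __m64_128 are widened
//to __m128i with _pM128i, processed with an ordinary SSE2 intrinsic, and the low 64 bits of the
//result are stored back via return64. The function name is hypothetical.
_NEON2SSE_INLINE int8x8_t _neon2sse_example_add_d8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64; //required by the return64 macro
    return64(_mm_add_epi8(_pM128i(a), _pM128i(b))); //only the low 8 bytes carry the result
}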
//*************************************************************************
//*************************************************************************
//********* Functions declarations as declared in original arm_neon.h *****
//*************************************************************************
//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
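//Note (added illustration, an assumption about the mapping rather than original text): the
//q-register (128-bit) adds above correspond one-to-one to single SSE2 intrinsics, e.g. roughly
//vaddq_s32(a,b) ~ _mm_add_epi32(a,b) and vaddq_f32(a,b) ~ _mm_add_ps(a,b); the d-register
//(64-bit) forms work on the low halves of __m128i values as shown in the _pM128i/return64 sketch earlier.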
//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
//Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
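//Worked illustration (added, not part of the original header): for one uint8 lane with a=200, b=101
//a plain add wraps to 45, while vhadd gives (200+101)>>1 = 150 and vrhadd gives (200+101+1)>>1 = 151;
//the unsigned rounding form matches the SSE2 average instruction, e.g. vrhaddq_u8 behaves like _mm_avg_epu8.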
//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
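//Implementation note (added sketch, an assumption rather than original text): the 8- and 16-bit
//saturating adds map directly onto the SSE2 saturating instructions, e.g. vqaddq_s8 ~ _mm_adds_epi8
//and vqaddq_u16 ~ _mm_adds_epu16; the 32- and 64-bit lane sizes have no SSE counterpart and need
//explicit overflow handling, which is why they are noticeably more expensive on x86.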
//Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
//Vector rounding add high half: vraddhn
int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
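//Worked illustration (added, not part of the original header): "high half" means each wide sum is
//narrowed by keeping only its upper half, i.e. for 16-bit lanes vaddhn_s16 returns (int8_t)((a+b) >> 8);
//the rounding form vraddhn adds 1<<7 before the shift, e.g. a=0x1200, b=0x00FF -> sum 0x12FF ->
//vaddhn lane 0x12, vraddhn lane (0x12FF+0x80)>>8 = 0x13.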
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
//Vector multiply by lane
int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
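//Note (added explanation, not original text): the _lane_ forms multiply every element of the first
//vector by one element of the second vector selected with the constant c, e.g.
//vmulq_lane_s16(a, b, 2) computes Vr[i] := a[i] * b[2] for all eight lanes; __constrange above is
//only a porting annotation, on x86 it expands to a plain const int.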
//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
//Vector multiply subtract long
int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
//Vector saturating doubling multiply high
int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
//Vector saturating rounding doubling multiply high
int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
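//Note (added explanation, an assumption about typical fixed-point use rather than original text):
//vqdmulh returns the high half of the doubled product, i.e. sat((2*a*b) >> 16) for 16-bit lanes,
//which is the usual Q15 fixed-point multiply; the rounding variant vqrdmulhq_s16 is close to the
//SSSE3 _mm_mulhrs_epi16 behaviour, differing only in the saturation of the 0x8000*0x8000 corner case.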
//Vector saturating doubling multiply accumulate long
int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
//Vector saturating doubling multiply subtract long
int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
//Vector long multiply
int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
//Vector saturating doubling long multiply
int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
//Vector subtract: vsub -> Vr[i]:=Va[i]-Vb[i]
int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
//Vector saturating subtract
int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
//Vector halving subtract
int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
//Vector subtract high half
int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
//Vector rounding subtract high half
int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
//Vector compare equal
uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
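//Note (added explanation, not original text): all the comparison intrinsics below return unsigned
//vectors with every bit of a lane set when the condition holds and all bits clear otherwise
//(e.g. 0xFFFF or 0x0000 per 16-bit lane), which matches the SSE compare instructions such as
//_mm_cmpeq_epi16 and makes the results directly usable as bit masks for blending/selection.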
//Vector compare greater-than or equal
uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare less-than or equal
uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare greater-than
uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare less-than
uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare absolute greater-than or equal
uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute less-than or equal
uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute greater-than
uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector compare absolute less-than
uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
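//Note (added sketch, an assumption about a straightforward x86 mapping rather than original text):
//the absolute compares test |a| against |b|; on SSE this can be done by clearing the sign bits and
//reusing the ordinary float compare, e.g. roughly for vcageq_f32:
//    __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
//    __m128 res  = _mm_cmpge_ps(_mm_and_ps(a, mask), _mm_and_ps(b, mask));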
uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
//Absolute difference
//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
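//Implementation note (added sketch/assumption, not original text): the unsigned byte form has a
//compact SSE2 expression, e.g. roughly vabdq_u8(a,b) ~ _mm_sub_epi8(_mm_max_epu8(a,b), _mm_min_epu8(a,b)),
//and the float forms are simply a subtraction with the sign bit cleared; the signed integer forms
//need a little more work because _mm_max_epi8/_mm_max_epi32 only appear with SSE4.1.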
//Absolute difference - long
int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
//Absolute difference and accumulate - long
int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
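//Implementation note (added, an assumption about the mapping rather than original text): vmaxq_u8,
//vmaxq_s16 and vmaxq_f32 (and the matching min forms) map directly to SSE2 _mm_max_epu8,
//_mm_max_epi16 and _mm_max_ps; the remaining integer combinations (signed 8-bit, signed 32-bit,
//unsigned 16/32-bit) only get single-instruction counterparts with SSE4.1, which is one reason the
//USE_SSE4 switch above matters for performance.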
//Pairwise add
int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
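//Worked illustration (added, not original text): the pairwise add sums adjacent elements of each
//operand and concatenates the results, e.g. vpadd_s16(a,b) = { a0+a1, a2+a3, b0+b1, b2+b3 };
//on x86 the float variant corresponds to the SSE3 horizontal add (_mm_hadd_ps) applied to the packed halves.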
//Long pairwise add
int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
//Long pairwise add and accumulate
int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
//Folding maximum vpmax -> takes maximum of adjacent pairs
int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
//Folding minimum vpmin -> takes minimum of adjacent pairs
int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0

float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
//Shifts by signed variable
//Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
//Vector saturating shift left: (negative values shift right)
int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
//Vector rounding shift left: (negative values shift right)
int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
//Vector saturating rounding shift left: (negative values shift right)
int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
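//Note (added explanation/assumption, not original text): these variable shifts take a per-lane
//*signed* shift count where a negative count shifts right, something SSE2 cannot express directly
//(its shift instructions apply one count to all lanes), so the ported versions are emulated and are
//typical candidates for the _NEON2SSE_PERFORMANCE_WARNING annotation defined earlier.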
987 //Shifts by a constant
988 //Vector shift right by constant
989 int8x8_t
vshr_n_s8(int8x8_t a
, __constrange(1,8) int b
); // VSHR.S8 d0,d0,#8
990 int16x4_t
vshr_n_s16(int16x4_t a
, __constrange(1,16) int b
); // VSHR.S16 d0,d0,#16
991 int32x2_t
vshr_n_s32(int32x2_t a
, __constrange(1,32) int b
); // VSHR.S32 d0,d0,#32
992 int64x1_t
vshr_n_s64(int64x1_t a
, __constrange(1,64) int b
); // VSHR.S64 d0,d0,#64
993 uint8x8_t
vshr_n_u8(uint8x8_t a
, __constrange(1,8) int b
); // VSHR.U8 d0,d0,#8
994 uint16x4_t
vshr_n_u16(uint16x4_t a
, __constrange(1,16) int b
); // VSHR.U16 d0,d0,#16
995 uint32x2_t
vshr_n_u32(uint32x2_t a
, __constrange(1,32) int b
); // VSHR.U32 d0,d0,#32
996 uint64x1_t
vshr_n_u64(uint64x1_t a
, __constrange(1,64) int b
); // VSHR.U64 d0,d0,#64
997 int8x16_t
vshrq_n_s8(int8x16_t a
, __constrange(1,8) int b
); // VSHR.S8 q0,q0,#8
998 int16x8_t
vshrq_n_s16(int16x8_t a
, __constrange(1,16) int b
); // VSHR.S16 q0,q0,#16
999 int32x4_t
vshrq_n_s32(int32x4_t a
, __constrange(1,32) int b
); // VSHR.S32 q0,q0,#32
1000 int64x2_t
vshrq_n_s64(int64x2_t a
, __constrange(1,64) int b
); // VSHR.S64 q0,q0,#64
1001 uint8x16_t
vshrq_n_u8(uint8x16_t a
, __constrange(1,8) int b
); // VSHR.U8 q0,q0,#8
1002 uint16x8_t
vshrq_n_u16(uint16x8_t a
, __constrange(1,16) int b
); // VSHR.U16 q0,q0,#16
1003 uint32x4_t
vshrq_n_u32(uint32x4_t a
, __constrange(1,32) int b
); // VSHR.U32 q0,q0,#32
1004 uint64x2_t
vshrq_n_u64(uint64x2_t a
, __constrange(1,64) int b
); // VSHR.U64 q0,q0,#64
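//Illustrative sketch only (placeholders, not part of the original listing): the shift amount of the
//*_n_* variants must be a compile-time constant within the documented __constrange limits.
//    uint16x8_t pix = vdupq_n_u16(1023);        //eight 10-bit samples
//    uint16x8_t hi8 = vshrq_n_u16(pix, 2);      //keep the top 8 bits: 1023 >> 2 = 255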
//Vector shift left by constant
int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
//Vector rounding shift right by constant
int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
//Vector shift right by constant and accumulate
int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
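//Illustrative sketch only (placeholders, not part of the original listing): the accumulate forms
//compute a + (b >> c) per lane, which avoids a separate add when summing scaled terms.
//    uint32x4_t acc = vdupq_n_u32(10);
//    uint32x4_t b   = vdupq_n_u32(64);
//    acc = vsraq_n_u32(acc, b, 4);          //each lane: 10 + (64 >> 4) = 14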
//Vector rounding shift right by constant and accumulate
int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
//Vector saturating shift left by constant
int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
//Vector signed->unsigned saturating shift left by constant
uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
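//Illustrative sketch only (placeholders, not part of the original listing): the signed->unsigned
//saturating shift clamps negative inputs to 0 and overflowing results to the unsigned maximum.
//    int16x4_t s = vdup_n_s16(-5);
//    uint16x4_t u = vqshlu_n_s16(s, 4);     //negative input saturates to 0 in every lane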
//Vector narrowing shift right by constant
int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
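//Illustrative sketch only (placeholders, not part of the original listing): the narrowing shifts
//take a full-width (q) input and return a half-width (d) result, truncating after the shift.
//    uint16x8_t wide = vdupq_n_u16(0x1234);
//    uint8x8_t  low  = vshrn_n_u16(wide, 8);   //keep the high byte of each lane: 0x12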
//Vector signed->unsigned narrowing saturating shift right by constant
uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
//Vector signed->unsigned rounding narrowing saturating shift right by constant
uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
//Vector narrowing saturating shift right by constant
int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
//Vector rounding narrowing shift right by constant
int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
//Vector rounding narrowing saturating shift right by constant
int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
//Vector widening shift left by constant
int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
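//Illustrative sketch only (placeholders, not part of the original listing): a common fixed-point
//pattern is to widen before arithmetic and narrow with rounding and saturation afterwards.
//    uint16x8_t w = vshll_n_u8(vdup_n_u8(200), 4);   //widen 8->16 bits and scale by 16
//    uint8x8_t  r = vqrshrn_n_u16(w, 4);             //rounding, saturating narrow back to 8 bits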
//Shifts with insert
//Vector shift right and insert
int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
//Vector shift left and insert
int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
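//Illustrative sketch only (placeholders, not part of the original listing): the insert forms put
//the shifted b into a while preserving the bits of a outside the shifted field (the low c bits for
//VSLI, the high c bits for VSRI), which is convenient for packing bit fields.
//    uint16x4_t rg = vdup_n_u16(0x001F);              //5-bit red already in place
//    uint16x4_t g  = vdup_n_u16(0x003F);              //6-bit green
//    rg = vsli_n_u16(rg, g, 5);                       //insert green at bit 5: result 0x07FF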
//Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
//Load a single vector from memory
uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
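//Illustrative sketch only (placeholder array, not part of the original listing): __transfersize(N)
//documents how many elements the intrinsic reads, so vld1q_f32 below touches exactly 4 floats.
//    float32_t buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//    float32x4_t v = vld1q_f32(buf);                  //load the whole 128-bit vector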
//Load a single lane from memory
uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
//Load all lanes of vector with same value from memory
uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
//Store a single vector or lane. Stores all lanes or a single lane of a vector.
//Store a single vector into memory
void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
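//Illustrative sketch only (placeholder buffer, not part of the original listing): a typical
//load -> compute -> store round trip with the vld1/vst1 family.
//    uint8_t img[16];                           //filled elsewhere
//    uint8x16_t p = vld1q_u8(img);              //load 16 pixels
//    p = vshrq_n_u8(p, 1);                      //halve their brightness
//    vst1q_u8(img, p);                          //store them back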
//Store a lane of a vector into memory
//Loads of an N-element structure
//Load N-element structure from memory
uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
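//Illustrative sketch only (placeholder buffer, not part of the original listing): the vldN loads
//deinterleave structure members into separate vectors, e.g. RGB pixels into .val[0..2].
//    uint8_t rgb[48];                                  //16 interleaved RGB pixels, filled elsewhere
//    uint8x16x3_t px = vld3q_u8(rgb);                  //px.val[0]=R, px.val[1]=G, px.val[2]=B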
//Load all lanes of N-element structure with same value from memory
uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
//float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
//Load a single lane of N-element structure from memory
//the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
//float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
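//Illustrative sketch only (placeholders, not part of the original listing): these *_ptr variants
//take the source structure by pointer instead of by value because of the MSVC C2719 alignment
//limitation mentioned above.
//    uint16_t two[2];                                   //filled elsewhere
//    uint16x4x2_t pair;                                 //previously initialized
//    pair = vld2_lane_u16_ptr(two, &pair, 0);           //refill lane 0 of both vectors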
//Store N-element structure to memory
void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
//void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
//Store a single lane of N-element structure to memory
void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16 {d0[0], d2[0]}, [r0]
void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32 {d0[0], d2[0]}, [r0]
void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16 {d0[0], d2[0]}, [r0]
void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32 {d0[0], d2[0]}, [r0]
void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16 {d0[0], d2[0]}, [r0]
void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); // VST2.32 {d0[0], d2[0]}, [r0]
void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16 {d0[0], d2[0]}, [r0]
void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); // VST3.32 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16 {d0[0], d2[0], d4[0]}, [r0]
void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16 {d0[0], d1[0], d2[0]}, [r0]
void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); // VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
1555 void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr
, uint8x8x4_t
* val
, __constrange(0,7) int lane
); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1556 void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr
, uint16x4x4_t
* val
, __constrange(0,3) int lane
); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1557 void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr
, uint32x2x4_t
* val
, __constrange(0,1) int lane
); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1558 void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr
, int8x8x4_t
* val
, __constrange(0,7) int lane
); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
1559 void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr
, int16x4x4_t
* val
, __constrange(0,3) int lane
); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1560 void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr
, int32x2x4_t
* val
, __constrange(0,1) int lane
); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1561 void vst4_lane_f16_ptr(__transfersize(4) __fp16
* ptr
, float16x4x4_t
* val
, __constrange(0,3) int lane
); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1562 void vst4_lane_f32_ptr(__transfersize(4) float32_t
* ptr
, float32x2x4_t
* val
, __constrange(0,1) int lane
); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
1563 void vst4_lane_p8_ptr(__transfersize(4) poly8_t
* ptr
, poly8x8x4_t
* val
, __constrange(0,7) int lane
); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
1564 void vst4_lane_p16_ptr(__transfersize(4) poly16_t
* ptr
, poly16x4x4_t
* val
, __constrange(0,3) int lane
); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
1565 //Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
1566 uint8_t vget_lane_u8(uint8x8_t vec
, __constrange(0,7) int lane
); // VMOV.U8 r0, d0[0]
1567 uint16_t vget_lane_u16(uint16x4_t vec
, __constrange(0,3) int lane
); // VMOV.U16 r0, d0[0]
1568 uint32_t vget_lane_u32(uint32x2_t vec
, __constrange(0,1) int lane
); // VMOV.32 r0, d0[0]
1569 int8_t vget_lane_s8(int8x8_t vec
, __constrange(0,7) int lane
); // VMOV.S8 r0, d0[0]
1570 int16_t vget_lane_s16(int16x4_t vec
, __constrange(0,3) int lane
); // VMOV.S16 r0, d0[0]
1571 int32_t vget_lane_s32(int32x2_t vec
, __constrange(0,1) int lane
); // VMOV.32 r0, d0[0]
1572 poly8_t
vget_lane_p8(poly8x8_t vec
, __constrange(0,7) int lane
); // VMOV.U8 r0, d0[0]
1573 poly16_t
vget_lane_p16(poly16x4_t vec
, __constrange(0,3) int lane
); // VMOV.U16 r0, d0[0]
1574 float32_t
vget_lane_f32(float32x2_t vec
, __constrange(0,1) int lane
); // VMOV.32 r0, d0[0]
1575 uint8_t vgetq_lane_u8(uint8x16_t vec
, __constrange(0,15) int lane
); // VMOV.U8 r0, d0[0]
1576 uint16_t vgetq_lane_u16(uint16x8_t vec
, __constrange(0,7) int lane
); // VMOV.U16 r0, d0[0]
1577 uint32_t vgetq_lane_u32(uint32x4_t vec
, __constrange(0,3) int lane
); // VMOV.32 r0, d0[0]
1578 int8_t vgetq_lane_s8(int8x16_t vec
, __constrange(0,15) int lane
); // VMOV.S8 r0, d0[0]
1579 int16_t vgetq_lane_s16(int16x8_t vec
, __constrange(0,7) int lane
); // VMOV.S16 r0, d0[0]
1580 int32_t vgetq_lane_s32(int32x4_t vec
, __constrange(0,3) int lane
); // VMOV.32 r0, d0[0]
1581 poly8_t
vgetq_lane_p8(poly8x16_t vec
, __constrange(0,15) int lane
); // VMOV.U8 r0, d0[0]
1582 poly16_t
vgetq_lane_p16(poly16x8_t vec
, __constrange(0,7) int lane
); // VMOV.U16 r0, d0[0]
1583 float32_t
vgetq_lane_f32(float32x4_t vec
, __constrange(0,3) int lane
); // VMOV.32 r0, d0[0]
1584 int64_t vget_lane_s64(int64x1_t vec
, __constrange(0,0) int lane
); // VMOV r0,r0,d0
1585 uint64_t vget_lane_u64(uint64x1_t vec
, __constrange(0,0) int lane
); // VMOV r0,r0,d0
1586 int64_t vgetq_lane_s64(int64x2_t vec
, __constrange(0,1) int lane
); // VMOV r0,r0,d0
1587 uint64_t vgetq_lane_u64(uint64x2_t vec
, __constrange(0,1) int lane
); // VMOV r0,r0,d0
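// Illustrative sketch only (an assumption, not a statement about how this header implements the
// vget_lane/vgetq_lane family): with plain SSE2 a 16-bit lane read maps naturally onto
// _mm_extract_epi16, and a 32-bit lane read can be done by shifting the wanted lane down to
// position 0 and reading the low element. The helper name and the fixed lane are hypothetical.
static int example_getq_lane_s32_2(__m128i vec)
{
    // lane 2: shift right by 2 * 4 bytes, then read the low 32 bits
    return _mm_cvtsi128_si32(_mm_srli_si128(vec, 8));
}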
1588 //Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
1589 uint8x8_t
vset_lane_u8(uint8_t value
, uint8x8_t vec
, __constrange(0,7) int lane
); // VMOV.8 d0[0],r0
1590 uint16x4_t
vset_lane_u16(uint16_t value
, uint16x4_t vec
, __constrange(0,3) int lane
); // VMOV.16 d0[0],r0
1591 uint32x2_t
vset_lane_u32(uint32_t value
, uint32x2_t vec
, __constrange(0,1) int lane
); // VMOV.32 d0[0],r0
1592 int8x8_t
vset_lane_s8(int8_t value
, int8x8_t vec
, __constrange(0,7) int lane
); // VMOV.8 d0[0],r0
1593 int16x4_t
vset_lane_s16(int16_t value
, int16x4_t vec
, __constrange(0,3) int lane
); // VMOV.16 d0[0],r0
1594 int32x2_t
vset_lane_s32(int32_t value
, int32x2_t vec
, __constrange(0,1) int lane
); // VMOV.32 d0[0],r0
1595 poly8x8_t
vset_lane_p8(poly8_t value
, poly8x8_t vec
, __constrange(0,7) int lane
); // VMOV.8 d0[0],r0
1596 poly16x4_t
vset_lane_p16(poly16_t value
, poly16x4_t vec
, __constrange(0,3) int lane
); // VMOV.16 d0[0],r0
1597 float32x2_t
vset_lane_f32(float32_t value
, float32x2_t vec
, __constrange(0,1) int lane
); // VMOV.32 d0[0],r0
1598 uint8x16_t
vsetq_lane_u8(uint8_t value
, uint8x16_t vec
, __constrange(0,15) int lane
); // VMOV.8 d0[0],r0
1599 uint16x8_t
vsetq_lane_u16(uint16_t value
, uint16x8_t vec
, __constrange(0,7) int lane
); // VMOV.16 d0[0],r0
1600 uint32x4_t
vsetq_lane_u32(uint32_t value
, uint32x4_t vec
, __constrange(0,3) int lane
); // VMOV.32 d0[0],r0
1601 int8x16_t
vsetq_lane_s8(int8_t value
, int8x16_t vec
, __constrange(0,15) int lane
); // VMOV.8 d0[0],r0
1602 int16x8_t
vsetq_lane_s16(int16_t value
, int16x8_t vec
, __constrange(0,7) int lane
); // VMOV.16 d0[0],r0
1603 int32x4_t
vsetq_lane_s32(int32_t value
, int32x4_t vec
, __constrange(0,3) int lane
); // VMOV.32 d0[0],r0
1604 poly8x16_t
vsetq_lane_p8(poly8_t value
, poly8x16_t vec
, __constrange(0,15) int lane
); // VMOV.8 d0[0],r0
1605 poly16x8_t
vsetq_lane_p16(poly16_t value
, poly16x8_t vec
, __constrange(0,7) int lane
); // VMOV.16 d0[0],r0
1606 float32x4_t
vsetq_lane_f32(float32_t value
, float32x4_t vec
, __constrange(0,3) int lane
); // VMOV.32 d0[0],r0
1607 int64x1_t
vset_lane_s64(int64_t value
, int64x1_t vec
, __constrange(0,0) int lane
); // VMOV d0,r0,r0
1608 uint64x1_t
vset_lane_u64(uint64_t value
, uint64x1_t vec
, __constrange(0,0) int lane
); // VMOV d0,r0,r0
1609 int64x2_t
vsetq_lane_s64(int64_t value
, int64x2_t vec
, __constrange(0,1) int lane
); // VMOV d0,r0,r0
1610 uint64x2_t
vsetq_lane_u64(uint64_t value
, uint64x2_t vec
, __constrange(0,1) int lane
); // VMOV d0,r0,r0
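// Illustrative sketch only (assumption, not necessarily this header's implementation): setting one
// 16-bit lane while keeping the rest of the vector is exactly what the SSE2 _mm_insert_epi16
// intrinsic does. The helper name and the fixed lane below are hypothetical.
static __m128i example_setq_lane_u16_3(uint16_t value, __m128i vec)
{
    return _mm_insert_epi16(vec, (int)value, 3); // replace lane 3, keep all other lanes
}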
1611 //Initialize a vector from a literal bit pattern.
1612 int8x8_t
vcreate_s8(uint64_t a
); // VMOV d0,r0,r0
1613 int16x4_t
vcreate_s16(uint64_t a
); // VMOV d0,r0,r0
1614 int32x2_t
vcreate_s32(uint64_t a
); // VMOV d0,r0,r0
1615 float16x4_t
vcreate_f16(uint64_t a
); // VMOV d0,r0,r0
1616 float32x2_t
vcreate_f32(uint64_t a
); // VMOV d0,r0,r0
1617 uint8x8_t
vcreate_u8(uint64_t a
); // VMOV d0,r0,r0
1618 uint16x4_t
vcreate_u16(uint64_t a
); // VMOV d0,r0,r0
1619 uint32x2_t
vcreate_u32(uint64_t a
); // VMOV d0,r0,r0
1620 uint64x1_t
vcreate_u64(uint64_t a
); // VMOV d0,r0,r0
1621 poly8x8_t
vcreate_p8(uint64_t a
); // VMOV d0,r0,r0
1622 poly16x4_t
vcreate_p16(uint64_t a
); // VMOV d0,r0,r0
1623 int64x1_t
vcreate_s64(uint64_t a
); // VMOV d0,r0,r0
1624 //Set all lanes to same value
1625 //Load all lanes of vector to the same literal value
1626 uint8x8_t
vdup_n_u8(uint8_t value
); // VDUP.8 d0,r0
1627 uint16x4_t
vdup_n_u16(uint16_t value
); // VDUP.16 d0,r0
1628 uint32x2_t
vdup_n_u32(uint32_t value
); // VDUP.32 d0,r0
1629 int8x8_t
vdup_n_s8(int8_t value
); // VDUP.8 d0,r0
1630 int16x4_t
vdup_n_s16(int16_t value
); // VDUP.16 d0,r0
1631 int32x2_t
vdup_n_s32(int32_t value
); // VDUP.32 d0,r0
1632 poly8x8_t
vdup_n_p8(poly8_t value
); // VDUP.8 d0,r0
1633 poly16x4_t
vdup_n_p16(poly16_t value
); // VDUP.16 d0,r0
1634 float32x2_t
vdup_n_f32(float32_t value
); // VDUP.32 d0,r0
1635 uint8x16_t
vdupq_n_u8(uint8_t value
); // VDUP.8 q0,r0
1636 uint16x8_t
vdupq_n_u16(uint16_t value
); // VDUP.16 q0,r0
1637 uint32x4_t
vdupq_n_u32(uint32_t value
); // VDUP.32 q0,r0
1638 int8x16_t
vdupq_n_s8(int8_t value
); // VDUP.8 q0,r0
1639 int16x8_t
vdupq_n_s16(int16_t value
); // VDUP.16 q0,r0
1640 int32x4_t
vdupq_n_s32(int32_t value
); // VDUP.32 q0,r0
1641 poly8x16_t
vdupq_n_p8(poly8_t value
); // VDUP.8 q0,r0
1642 poly16x8_t
vdupq_n_p16(poly16_t value
); // VDUP.16 q0,r0
1643 float32x4_t
vdupq_n_f32(float32_t value
); // VDUP.32 q0,r0
1644 int64x1_t
vdup_n_s64(int64_t value
); // VMOV d0,r0,r0
1645 uint64x1_t
vdup_n_u64(uint64_t value
); // VMOV d0,r0,r0
1646 int64x2_t
vdupq_n_s64(int64_t value
); // VMOV d0,r0,r0
1647 uint64x2_t
vdupq_n_u64(uint64_t value
); // VMOV d0,r0,r0
1648 uint8x8_t
vmov_n_u8(uint8_t value
); // VDUP.8 d0,r0
1649 uint16x4_t
vmov_n_u16(uint16_t value
); // VDUP.16 d0,r0
1650 uint32x2_t
vmov_n_u32(uint32_t value
); // VDUP.32 d0,r0
1651 int8x8_t
vmov_n_s8(int8_t value
); // VDUP.8 d0,r0
1652 int16x4_t
vmov_n_s16(int16_t value
); // VDUP.16 d0,r0
1653 int32x2_t
vmov_n_s32(int32_t value
); // VDUP.32 d0,r0
1654 poly8x8_t
vmov_n_p8(poly8_t value
); // VDUP.8 d0,r0
1655 poly16x4_t
vmov_n_p16(poly16_t value
); // VDUP.16 d0,r0
1656 float32x2_t
vmov_n_f32(float32_t value
); // VDUP.32 d0,r0
1657 uint8x16_t
vmovq_n_u8(uint8_t value
); // VDUP.8 q0,r0
1658 uint16x8_t
vmovq_n_u16(uint16_t value
); // VDUP.16 q0,r0
1659 uint32x4_t
vmovq_n_u32(uint32_t value
); // VDUP.32 q0,r0
1660 int8x16_t
vmovq_n_s8(int8_t value
); // VDUP.8 q0,r0
1661 int16x8_t
vmovq_n_s16(int16_t value
); // VDUP.16 q0,r0
1662 int32x4_t
vmovq_n_s32(int32_t value
); // VDUP.32 q0,r0
1663 poly8x16_t
vmovq_n_p8(poly8_t value
); // VDUP.8 q0,r0
1664 poly16x8_t
vmovq_n_p16(poly16_t value
); // VDUP.16 q0,r0
1665 float32x4_t
vmovq_n_f32(float32_t value
); // VDUP.32 q0,r0
1666 int64x1_t
vmov_n_s64(int64_t value
); // VMOV d0,r0,r0
1667 uint64x1_t
vmov_n_u64(uint64_t value
); // VMOV d0,r0,r0
1668 int64x2_t
vmovq_n_s64(int64_t value
); // VMOV d0,r0,r0
1669 uint64x2_t
vmovq_n_u64(uint64_t value
); // VMOV d0,r0,r0
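// Illustrative sketch only (assumption): a NEON "duplicate scalar to all lanes" operation is a
// direct match for the SSE2 _mm_set1_* family. The helper name is hypothetical.
static __m128i example_dupq_n_s16(int16_t value)
{
    return _mm_set1_epi16(value); // broadcast the scalar into all eight 16-bit lanes
}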
1670 //Load all lanes of the vector to the value of a lane of a vector
1671 uint8x8_t
vdup_lane_u8(uint8x8_t vec
, __constrange(0,7) int lane
); // VDUP.8 d0,d0[0]
1672 uint16x4_t
vdup_lane_u16(uint16x4_t vec
, __constrange(0,3) int lane
); // VDUP.16 d0,d0[0]
1673 uint32x2_t
vdup_lane_u32(uint32x2_t vec
, __constrange(0,1) int lane
); // VDUP.32 d0,d0[0]
1674 int8x8_t
vdup_lane_s8(int8x8_t vec
, __constrange(0,7) int lane
); // VDUP.8 d0,d0[0]
1675 int16x4_t
vdup_lane_s16(int16x4_t vec
, __constrange(0,3) int lane
); // VDUP.16 d0,d0[0]
1676 int32x2_t
vdup_lane_s32(int32x2_t vec
, __constrange(0,1) int lane
); // VDUP.32 d0,d0[0]
1677 poly8x8_t
vdup_lane_p8(poly8x8_t vec
, __constrange(0,7) int lane
); // VDUP.8 d0,d0[0]
1678 poly16x4_t
vdup_lane_p16(poly16x4_t vec
, __constrange(0,3) int lane
); // VDUP.16 d0,d0[0]
1679 float32x2_t
vdup_lane_f32(float32x2_t vec
, __constrange(0,1) int lane
); // VDUP.32 d0,d0[0]
1680 uint8x16_t
vdupq_lane_u8(uint8x8_t vec
, __constrange(0,7) int lane
); // VDUP.8 q0,d0[0]
1681 uint16x8_t
vdupq_lane_u16(uint16x4_t vec
, __constrange(0,3) int lane
); // VDUP.16 q0,d0[0]
1682 uint32x4_t
vdupq_lane_u32(uint32x2_t vec
, __constrange(0,1) int lane
); // VDUP.32 q0,d0[0]
1683 int8x16_t
vdupq_lane_s8(int8x8_t vec
, __constrange(0,7) int lane
); // VDUP.8 q0,d0[0]
1684 int16x8_t
vdupq_lane_s16(int16x4_t vec
, __constrange(0,3) int lane
); // VDUP.16 q0,d0[0]
1685 int32x4_t
vdupq_lane_s32(int32x2_t vec
, __constrange(0,1) int lane
); // VDUP.32 q0,d0[0]
1686 poly8x16_t
vdupq_lane_p8(poly8x8_t vec
, __constrange(0,7) int lane
); // VDUP.8 q0,d0[0]
1687 poly16x8_t
vdupq_lane_p16(poly16x4_t vec
, __constrange(0,3) int lane
); // VDUP.16 q0,d0[0]
1688 float32x4_t
vdupq_lane_f32(float32x2_t vec
, __constrange(0,1) int lane
); // VDUP.32 q0,d0[0]
1689 int64x1_t
vdup_lane_s64(int64x1_t vec
, __constrange(0,0) int lane
); // VMOV d0,d0
1690 uint64x1_t
vdup_lane_u64(uint64x1_t vec
, __constrange(0,0) int lane
); // VMOV d0,d0
1691 int64x2_t
vdupq_lane_s64(int64x1_t vec
, __constrange(0,0) int lane
); // VMOV q0,q0
1692 uint64x2_t
vdupq_lane_u64(uint64x1_t vec
, __constrange(0,0) int lane
); // VMOV q0,q0
//Combining vectors. These intrinsics join two 64 bit vectors into a single 128 bit vector.
int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
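// Illustrative sketch only (assumption): if the two 64-bit halves are kept in the low halves of
// __m128i registers, joining them is a single SSE2 unpack of the low 64-bit elements. The helper
// name is hypothetical.
static __m128i example_combine_u32(__m128i low, __m128i high)
{
    return _mm_unpacklo_epi64(low, high); // low 64 bits from 'low', high 64 bits from 'high'
}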
1706 //Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
1707 int8x8_t
vget_high_s8(int8x16_t a
); // VMOV d0,d0
1708 int16x4_t
vget_high_s16(int16x8_t a
); // VMOV d0,d0
1709 int32x2_t
vget_high_s32(int32x4_t a
); // VMOV d0,d0
1710 int64x1_t
vget_high_s64(int64x2_t a
); // VMOV d0,d0
1711 float16x4_t
vget_high_f16(float16x8_t a
); // VMOV d0,d0
1712 float32x2_t
vget_high_f32(float32x4_t a
); // VMOV d0,d0
1713 uint8x8_t
vget_high_u8(uint8x16_t a
); // VMOV d0,d0
1714 uint16x4_t
vget_high_u16(uint16x8_t a
); // VMOV d0,d0
1715 uint32x2_t
vget_high_u32(uint32x4_t a
); // VMOV d0,d0
1716 uint64x1_t
vget_high_u64(uint64x2_t a
); // VMOV d0,d0
1717 poly8x8_t
vget_high_p8(poly8x16_t a
); // VMOV d0,d0
1718 poly16x4_t
vget_high_p16(poly16x8_t a
); // VMOV d0,d0
1719 int8x8_t
vget_low_s8(int8x16_t a
); // VMOV d0,d0
1720 int16x4_t
vget_low_s16(int16x8_t a
); // VMOV d0,d0
1721 int32x2_t
vget_low_s32(int32x4_t a
); // VMOV d0,d0
1722 int64x1_t
vget_low_s64(int64x2_t a
); // VMOV d0,d0
1723 float16x4_t
vget_low_f16(float16x8_t a
); // VMOV d0,d0
1724 float32x2_t
vget_low_f32(float32x4_t a
); // VMOV d0,d0
1725 uint8x8_t
vget_low_u8(uint8x16_t a
); // VMOV d0,d0
1726 uint16x4_t
vget_low_u16(uint16x8_t a
); // VMOV d0,d0
1727 uint32x2_t
vget_low_u32(uint32x4_t a
); // VMOV d0,d0
1728 uint64x1_t
vget_low_u64(uint64x2_t a
); // VMOV d0,d0
1729 poly8x8_t
vget_low_p8(poly8x16_t a
); // VMOV d0,d0
1730 poly16x4_t
vget_low_p16(poly16x8_t a
); // VMOV d0,d0
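// Illustrative sketch only (assumption): with the 64-bit halves stored in __m128i registers, the
// low half is already in place and the high half can be moved down with a byte shift. Helper
// names are hypothetical.
static __m128i example_get_low_u8(__m128i a)
{
    return a; // the low 64 bits already sit in byte lanes 0..7
}
static __m128i example_get_high_u8(__m128i a)
{
    return _mm_srli_si128(a, 8); // move the upper 64 bits down into the lower half
}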
//Converting vectors. These intrinsics are used to convert vectors.
//Convert from float
int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
//Convert between floats
float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
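// Illustrative sketch only (assumption): NEON VCVT to signed integer truncates towards zero, which
// matches the SSE2 _mm_cvttps_epi32 (note the extra 't') rather than _mm_cvtps_epi32, which rounds
// according to MXCSR. The reverse direction maps onto _mm_cvtepi32_ps. Helper names are hypothetical.
static __m128i example_cvtq_s32_f32(__m128 a)
{
    return _mm_cvttps_epi32(a); // truncating float32 -> int32 conversion, per lane
}
static __m128 example_cvtq_f32_s32(__m128i a)
{
    return _mm_cvtepi32_ps(a); // exact for |a| <= 2^24, rounded otherwise
}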
1753 //Vector narrow integer
1754 int8x8_t
vmovn_s16(int16x8_t a
); // VMOVN.I16 d0,q0
1755 int16x4_t
vmovn_s32(int32x4_t a
); // VMOVN.I32 d0,q0
1756 int32x2_t
vmovn_s64(int64x2_t a
); // VMOVN.I64 d0,q0
1757 uint8x8_t
vmovn_u16(uint16x8_t a
); // VMOVN.I16 d0,q0
1758 uint16x4_t
vmovn_u32(uint32x4_t a
); // VMOVN.I32 d0,q0
1759 uint32x2_t
vmovn_u64(uint64x2_t a
); // VMOVN.I64 d0,q0
1761 int16x8_t
vmovl_s8(int8x8_t a
); // VMOVL.S8 q0,d0
1762 int32x4_t
vmovl_s16(int16x4_t a
); // VMOVL.S16 q0,d0
1763 int64x2_t
vmovl_s32(int32x2_t a
); // VMOVL.S32 q0,d0
1764 uint16x8_t
vmovl_u8(uint8x8_t a
); // VMOVL.U8 q0,d0
1765 uint32x4_t
vmovl_u16(uint16x4_t a
); // VMOVL.U16 q0,d0
1766 uint64x2_t
vmovl_u32(uint32x2_t a
); // VMOVL.U32 q0,d0
1767 //Vector saturating narrow integer
1768 int8x8_t
vqmovn_s16(int16x8_t a
); // VQMOVN.S16 d0,q0
1769 int16x4_t
vqmovn_s32(int32x4_t a
); // VQMOVN.S32 d0,q0
1770 int32x2_t
vqmovn_s64(int64x2_t a
); // VQMOVN.S64 d0,q0
1771 uint8x8_t
vqmovn_u16(uint16x8_t a
); // VQMOVN.U16 d0,q0
1772 uint16x4_t
vqmovn_u32(uint32x4_t a
); // VQMOVN.U32 d0,q0
1773 uint32x2_t
vqmovn_u64(uint64x2_t a
); // VQMOVN.U64 d0,q0
1774 //Vector saturating narrow integer signed->unsigned
1775 uint8x8_t
vqmovun_s16(int16x8_t a
); // VQMOVUN.S16 d0,q0
1776 uint16x4_t
vqmovun_s32(int32x4_t a
); // VQMOVUN.S32 d0,q0
1777 uint32x2_t
vqmovun_s64(int64x2_t a
); // VQMOVUN.S64 d0,q0
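// Illustrative sketch only (assumption): zero-extending 8-bit lanes to 16 bits (vmovl_u8 style)
// needs only an SSE2 unpack against a zero register; sign extension can be built the same way
// by unpacking against a computed sign mask. The helper name is hypothetical.
static __m128i example_movl_u8(__m128i a) // the 8 source bytes are assumed to sit in the low half of 'a'
{
    return _mm_unpacklo_epi8(a, _mm_setzero_si128()); // u8 -> u16 widening of lanes 0..7
}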
1779 uint8x8_t
vtbl1_u8(uint8x8_t a
, uint8x8_t b
); // VTBL.8 d0, {d0}, d0
1780 int8x8_t
vtbl1_s8(int8x8_t a
, int8x8_t b
); // VTBL.8 d0, {d0}, d0
1781 poly8x8_t
vtbl1_p8(poly8x8_t a
, uint8x8_t b
); // VTBL.8 d0, {d0}, d0
1782 uint8x8_t
vtbl2_u8_ptr(uint8x8x2_t
*a
, uint8x8_t b
); // VTBL.8 d0, {d0, d1}, d0
1783 int8x8_t
vtbl2_s8_ptr(int8x8x2_t
*a
, int8x8_t b
); // VTBL.8 d0, {d0, d1}, d0
1784 poly8x8_t
vtbl2_p8_ptr(poly8x8x2_t
*a
, uint8x8_t b
); // VTBL.8 d0, {d0, d1}, d0
1785 uint8x8_t
vtbl3_u8_ptr(uint8x8x3_t
*a
, uint8x8_t b
); // VTBL.8 d0, {d0, d1, d2}, d0
1786 int8x8_t
vtbl3_s8_ptr(int8x8x3_t
*a
, int8x8_t b
); // VTBL.8 d0, {d0, d1, d2}, d0
1787 poly8x8_t
vtbl3_p8_ptr(poly8x8x3_t
*a
, uint8x8_t b
); // VTBL.8 d0, {d0, d1, d2}, d0
1788 uint8x8_t
vtbl4_u8_ptr(uint8x8x4_t
*a
, uint8x8_t b
); // VTBL.8 d0, {d0, d1, d2, d3}, d0
1789 int8x8_t
vtbl4_s8_ptr(int8x8x4_t
*a
, int8x8_t b
); // VTBL.8 d0, {d0, d1, d2, d3}, d0
1790 poly8x8_t
vtbl4_p8_ptr(poly8x8x4_t
*a
, uint8x8_t b
); // VTBL.8 d0, {d0, d1, d2, d3}, d0
1791 //Extended table look up intrinsics
1792 uint8x8_t
vtbx1_u8(uint8x8_t a
, uint8x8_t b
, uint8x8_t c
); // VTBX.8 d0, {d0}, d0
1793 int8x8_t
vtbx1_s8(int8x8_t a
, int8x8_t b
, int8x8_t c
); // VTBX.8 d0, {d0}, d0
1794 poly8x8_t
vtbx1_p8(poly8x8_t a
, poly8x8_t b
, uint8x8_t c
); // VTBX.8 d0, {d0}, d0
1795 uint8x8_t
vtbx2_u8_ptr(uint8x8_t a
, uint8x8x2_t
*b
, uint8x8_t c
); // VTBX.8 d0, {d0, d1}, d0
1796 int8x8_t
vtbx2_s8_ptr(int8x8_t a
, int8x8x2_t
*b
, int8x8_t c
); // VTBX.8 d0, {d0, d1}, d0
1797 poly8x8_t
vtbx2_p8_ptr(poly8x8_t a
, poly8x8x2_t
*b
, uint8x8_t c
); // VTBX.8 d0, {d0, d1}, d0
1798 uint8x8_t
vtbx3_u8_ptr(uint8x8_t a
, uint8x8x3_t
*b
, uint8x8_t c
); // VTBX.8 d0, {d0, d1, d2}, d0
1799 int8x8_t
vtbx3_s8_ptr(int8x8_t a
, int8x8x3_t
*b
, int8x8_t c
); // VTBX.8 d0, {d0, d1, d2}, d0
1800 poly8x8_t
vtbx3_p8_ptr(poly8x8_t a
, poly8x8x3_t
*b
, uint8x8_t c
); // VTBX.8 d0, {d0, d1, d2}, d0
1801 uint8x8_t
vtbx4_u8_ptr(uint8x8_t a
, uint8x8x4_t
*b
, uint8x8_t c
); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1802 int8x8_t
vtbx4_s8_ptr(int8x8_t a
, int8x8x4_t
*b
, int8x8_t c
); // VTBX.8 d0, {d0, d1, d2, d3}, d0
1803 poly8x8_t
vtbx4_p8_ptr(poly8x8_t a
, poly8x8x4_t
*b
, uint8x8_t c
); // VTBX.8 d0, {d0, d1, d2, d3}, d0
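// Illustrative sketch only (assumption): a single-register NEON table lookup can be built from the
// SSSE3 _mm_shuffle_epi8 byte shuffle. _mm_shuffle_epi8 yields 0 for selector bytes whose top bit is
// set, so indices >= 8 (out of range for a one d-register table) are forced into that form first.
// The helper name is hypothetical.
static __m128i example_tbl1_u8(__m128i table, __m128i idx) // 8 table bytes and 8 indices in the low halves
{
    __m128i out_of_range = _mm_cmpgt_epi8(idx, _mm_set1_epi8(7)); // 0xFF where idx is 8..127
    __m128i sel = _mm_or_si128(idx, out_of_range); // those lanes now have the top bit set
    return _mm_shuffle_epi8(table, sel); // out-of-range lanes become 0, as NEON VTBL requires
}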
1804 //Operations with a scalar value
1805 //Vector multiply accumulate with scalar
1806 int16x4_t
vmla_lane_s16(int16x4_t a
, int16x4_t b
, int16x4_t v
, __constrange(0,3) int l
); // VMLA.I16 d0, d0,d0[0]
1807 int32x2_t
vmla_lane_s32(int32x2_t a
, int32x2_t b
, int32x2_t v
, __constrange(0,1) int l
); // VMLA.I32 d0, d0,d0[0]
1808 uint16x4_t
vmla_lane_u16(uint16x4_t a
, uint16x4_t b
, uint16x4_t v
, __constrange(0,3) int l
); // VMLA.I16 d0, d0,d0[0]
1809 uint32x2_t
vmla_lane_u32(uint32x2_t a
, uint32x2_t b
, uint32x2_t v
, __constrange(0,1) int l
); // VMLA.I32 d0, d0,d0[0]
1810 float32x2_t
vmla_lane_f32(float32x2_t a
, float32x2_t b
, float32x2_t v
, __constrange(0,1) int l
); // VMLA.F32 d0,d0, d0[0]
1811 int16x8_t
vmlaq_lane_s16(int16x8_t a
, int16x8_t b
, int16x4_t v
, __constrange(0,3) int l
); // VMLA.I16 q0, q0,d0[0]
1812 int32x4_t
vmlaq_lane_s32(int32x4_t a
, int32x4_t b
, int32x2_t v
, __constrange(0,1) int l
); // VMLA.I32 q0, q0,d0[0]
1813 uint16x8_t
vmlaq_lane_u16(uint16x8_t a
, uint16x8_t b
, uint16x4_t v
, __constrange(0,3) int l
); // VMLA.I16 q0,q0, d0[0]
1814 uint32x4_t
vmlaq_lane_u32(uint32x4_t a
, uint32x4_t b
, uint32x2_t v
, __constrange(0,1) int l
); // VMLA.I32 q0,q0, d0[0]
1815 float32x4_t
vmlaq_lane_f32(float32x4_t a
, float32x4_t b
, float32x2_t v
, __constrange(0,1) int l
); // VMLA.F32 q0,q0, d0[0]
1816 //Vector widening multiply accumulate with scalar
1817 int32x4_t
vmlal_lane_s16(int32x4_t a
, int16x4_t b
, int16x4_t v
, __constrange(0,3) int l
); //VMLAL.S16 q0, d0,d0[0]
1818 int64x2_t
vmlal_lane_s32(int64x2_t a
, int32x2_t b
, int32x2_t v
, __constrange(0,1) int l
); //VMLAL.S32 q0, d0,d0[0]
1819 uint32x4_t
vmlal_lane_u16(uint32x4_t a
, uint16x4_t b
, uint16x4_t v
, __constrange(0,3) int l
); // VMLAL.U16 q0,d0, d0[0]
1820 uint64x2_t
vmlal_lane_u32(uint64x2_t a
, uint32x2_t b
, uint32x2_t v
, __constrange(0,1) int l
); // VMLAL.U32 q0,d0, d0[0]
1821 //Vector widening saturating doubling multiply accumulate with scalar
1822 int32x4_t
vqdmlal_lane_s16(int32x4_t a
, int16x4_t b
, int16x4_t v
, __constrange(0,3) int l
); // VQDMLAL.S16 q0,d0, d0[0]
1823 int64x2_t
vqdmlal_lane_s32(int64x2_t a
, int32x2_t b
, int32x2_t v
, __constrange(0,1) int l
); // VQDMLAL.S32 q0,d0, d0[0]
1824 //Vector multiply subtract with scalar
1825 int16x4_t
vmls_lane_s16(int16x4_t a
, int16x4_t b
, int16x4_t v
, __constrange(0,3) int l
); // VMLS.I16 d0, d0,d0[0]
1826 int32x2_t
vmls_lane_s32(int32x2_t a
, int32x2_t b
, int32x2_t v
, __constrange(0,1) int l
); // VMLS.I32 d0, d0,d0[0]
1827 uint16x4_t
vmls_lane_u16(uint16x4_t a
, uint16x4_t b
, uint16x4_t v
, __constrange(0,3) int l
); // VMLS.I16 d0, d0,d0[0]
1828 uint32x2_t
vmls_lane_u32(uint32x2_t a
, uint32x2_t b
, uint32x2_t v
, __constrange(0,1) int l
); // VMLS.I32 d0, d0,d0[0]
1829 float32x2_t
vmls_lane_f32(float32x2_t a
, float32x2_t b
, float32x2_t v
, __constrange(0,1) int l
); // VMLS.F32 d0,d0, d0[0]
1830 int16x8_t
vmlsq_lane_s16(int16x8_t a
, int16x8_t b
, int16x4_t v
, __constrange(0,3) int l
); // VMLS.I16 q0, q0,d0[0]
1831 int32x4_t
vmlsq_lane_s32(int32x4_t a
, int32x4_t b
, int32x2_t v
, __constrange(0,1) int l
); // VMLS.I32 q0, q0,d0[0]
1832 uint16x8_t
vmlsq_lane_u16(uint16x8_t a
, uint16x8_t b
, uint16x4_t v
, __constrange(0,3) int l
); // VMLS.I16 q0,q0, d0[0]
1833 uint32x4_t
vmlsq_lane_u32(uint32x4_t a
, uint32x4_t b
, uint32x2_t v
, __constrange(0,1) int l
); // VMLS.I32 q0,q0, d0[0]
1834 float32x4_t
vmlsq_lane_f32(float32x4_t a
, float32x4_t b
, float32x2_t v
, __constrange(0,1) int l
); // VMLS.F32 q0,q0, d0[0]
1835 //Vector widening multiply subtract with scalar
1836 int32x4_t
vmlsl_lane_s16(int32x4_t a
, int16x4_t b
, int16x4_t v
, __constrange(0,3) int l
); // VMLSL.S16 q0, d0,d0[0]
1837 int64x2_t
vmlsl_lane_s32(int64x2_t a
, int32x2_t b
, int32x2_t v
, __constrange(0,1) int l
); // VMLSL.S32 q0, d0,d0[0]
1838 uint32x4_t
vmlsl_lane_u16(uint32x4_t a
, uint16x4_t b
, uint16x4_t v
, __constrange(0,3) int l
); // VMLSL.U16 q0,d0, d0[0]
1839 uint64x2_t
vmlsl_lane_u32(uint64x2_t a
, uint32x2_t b
, uint32x2_t v
, __constrange(0,1) int l
); // VMLSL.U32 q0,d0, d0[0]
1840 //Vector widening saturating doubling multiply subtract with scalar
1841 int32x4_t
vqdmlsl_lane_s16(int32x4_t a
, int16x4_t b
, int16x4_t v
, __constrange(0,3) int l
); // VQDMLSL.S16 q0,d0, d0[0]
1842 int64x2_t
vqdmlsl_lane_s32(int64x2_t a
, int32x2_t b
, int32x2_t v
, __constrange(0,1) int l
); // VQDMLSL.S32 q0,d0, d0[0]
1843 //Vector multiply by scalar
1844 int16x4_t
vmul_n_s16(int16x4_t a
, int16_t b
); // VMUL.I16 d0,d0,d0[0]
1845 int32x2_t
vmul_n_s32(int32x2_t a
, int32_t b
); // VMUL.I32 d0,d0,d0[0]
1846 float32x2_t
vmul_n_f32(float32x2_t a
, float32_t b
); // VMUL.F32 d0,d0,d0[0]
1847 uint16x4_t
vmul_n_u16(uint16x4_t a
, uint16_t b
); // VMUL.I16 d0,d0,d0[0]
1848 uint32x2_t
vmul_n_u32(uint32x2_t a
, uint32_t b
); // VMUL.I32 d0,d0,d0[0]
1849 int16x8_t
vmulq_n_s16(int16x8_t a
, int16_t b
); // VMUL.I16 q0,q0,d0[0]
1850 int32x4_t
vmulq_n_s32(int32x4_t a
, int32_t b
); // VMUL.I32 q0,q0,d0[0]
1851 float32x4_t
vmulq_n_f32(float32x4_t a
, float32_t b
); // VMUL.F32 q0,q0,d0[0]
1852 uint16x8_t
vmulq_n_u16(uint16x8_t a
, uint16_t b
); // VMUL.I16 q0,q0,d0[0]
1853 uint32x4_t
vmulq_n_u32(uint32x4_t a
, uint32_t b
); // VMUL.I32 q0,q0,d0[0]
1854 //Vector long multiply with scalar
1855 int32x4_t
vmull_n_s16(int16x4_t vec1
, int16_t val2
); // VMULL.S16 q0,d0,d0[0]
1856 int64x2_t
vmull_n_s32(int32x2_t vec1
, int32_t val2
); // VMULL.S32 q0,d0,d0[0]
1857 uint32x4_t
vmull_n_u16(uint16x4_t vec1
, uint16_t val2
); // VMULL.U16 q0,d0,d0[0]
1858 uint64x2_t
vmull_n_u32(uint32x2_t vec1
, uint32_t val2
); // VMULL.U32 q0,d0,d0[0]
1859 //Vector long multiply by scalar
1860 int32x4_t
vmull_lane_s16(int16x4_t vec1
, int16x4_t val2
, __constrange(0, 3) int val3
); // VMULL.S16 q0,d0,d0[0]
1861 int64x2_t
vmull_lane_s32(int32x2_t vec1
, int32x2_t val2
, __constrange(0, 1) int val3
); // VMULL.S32 q0,d0,d0[0]
1862 uint32x4_t
vmull_lane_u16(uint16x4_t vec1
, uint16x4_t val2
, __constrange(0, 3) int val3
); // VMULL.U16 q0,d0,d0[0]
1863 uint64x2_t
vmull_lane_u32(uint32x2_t vec1
, uint32x2_t val2
, __constrange(0, 1) int val3
); // VMULL.U32 q0,d0,d0[0]
1864 //Vector saturating doubling long multiply with scalar
1865 int32x4_t
vqdmull_n_s16(int16x4_t vec1
, int16_t val2
); // VQDMULL.S16 q0,d0,d0[0]
1866 int64x2_t
vqdmull_n_s32(int32x2_t vec1
, int32_t val2
); // VQDMULL.S32 q0,d0,d0[0]
1867 //Vector saturating doubling long multiply by scalar
1868 int32x4_t
vqdmull_lane_s16(int16x4_t vec1
, int16x4_t val2
, __constrange(0, 3) int val3
); // VQDMULL.S16 q0,d0,d0[0]
1869 int64x2_t
vqdmull_lane_s32(int32x2_t vec1
, int32x2_t val2
, __constrange(0, 1) int val3
); // VQDMULL.S32 q0,d0,d0[0]
1870 //Vector saturating doubling multiply high with scalar
1871 int16x4_t
vqdmulh_n_s16(int16x4_t vec1
, int16_t val2
); // VQDMULH.S16 d0,d0,d0[0]
1872 int32x2_t
vqdmulh_n_s32(int32x2_t vec1
, int32_t val2
); // VQDMULH.S32 d0,d0,d0[0]
1873 int16x8_t
vqdmulhq_n_s16(int16x8_t vec1
, int16_t val2
); // VQDMULH.S16 q0,q0,d0[0]
1874 int32x4_t
vqdmulhq_n_s32(int32x4_t vec1
, int32_t val2
); // VQDMULH.S32 q0,q0,d0[0]
1875 //Vector saturating doubling multiply high by scalar
1876 int16x4_t
vqdmulh_lane_s16(int16x4_t vec1
, int16x4_t val2
, __constrange(0, 3) int val3
); // VQDMULH.S16 d0,d0,d0[0]
1877 int32x2_t
vqdmulh_lane_s32(int32x2_t vec1
, int32x2_t val2
, __constrange(0, 1) int val3
); // VQDMULH.S32 d0,d0,d0[0]
1878 int16x8_t
vqdmulhq_lane_s16(int16x8_t vec1
, int16x4_t val2
, __constrange(0, 3) int val3
); // VQDMULH.S16 q0,q0,d0[0]
1879 int32x4_t
vqdmulhq_lane_s32(int32x4_t vec1
, int32x2_t val2
, __constrange(0, 1) int val3
); // VQDMULH.S32 q0,q0,d0[0]
1880 //Vector saturating rounding doubling multiply high with scalar
1881 int16x4_t
vqrdmulh_n_s16(int16x4_t vec1
, int16_t val2
); // VQRDMULH.S16 d0,d0,d0[0]
1882 int32x2_t
vqrdmulh_n_s32(int32x2_t vec1
, int32_t val2
); // VQRDMULH.S32 d0,d0,d0[0]
1883 int16x8_t
vqrdmulhq_n_s16(int16x8_t vec1
, int16_t val2
); // VQRDMULH.S16 q0,q0,d0[0]
1884 int32x4_t
vqrdmulhq_n_s32(int32x4_t vec1
, int32_t val2
); // VQRDMULH.S32 q0,q0,d0[0]
1885 //Vector rounding saturating doubling multiply high by scalar
1886 int16x4_t
vqrdmulh_lane_s16(int16x4_t vec1
, int16x4_t val2
, __constrange(0, 3) int val3
); // VQRDMULH.S16 d0,d0,d0[0]
1887 int32x2_t
vqrdmulh_lane_s32(int32x2_t vec1
, int32x2_t val2
, __constrange(0, 1) int val3
); // VQRDMULH.S32 d0,d0,d0[0]
1888 int16x8_t
vqrdmulhq_lane_s16(int16x8_t vec1
, int16x4_t val2
, __constrange(0, 3) int val3
); // VQRDMULH.S16 q0,q0,d0[0]
1889 int32x4_t
vqrdmulhq_lane_s32(int32x4_t vec1
, int32x2_t val2
, __constrange(0, 1) int val3
); // VQRDMULH.S32 q0,q0,d0[0]
1890 //Vector multiply accumulate with scalar
1891 int16x4_t
vmla_n_s16(int16x4_t a
, int16x4_t b
, int16_t c
); // VMLA.I16 d0, d0, d0[0]
1892 int32x2_t
vmla_n_s32(int32x2_t a
, int32x2_t b
, int32_t c
); // VMLA.I32 d0, d0, d0[0]
1893 uint16x4_t
vmla_n_u16(uint16x4_t a
, uint16x4_t b
, uint16_t c
); // VMLA.I16 d0, d0, d0[0]
1894 uint32x2_t
vmla_n_u32(uint32x2_t a
, uint32x2_t b
, uint32_t c
); // VMLA.I32 d0, d0, d0[0]
1895 float32x2_t
vmla_n_f32(float32x2_t a
, float32x2_t b
, float32_t c
); // VMLA.F32 d0, d0, d0[0]
1896 int16x8_t
vmlaq_n_s16(int16x8_t a
, int16x8_t b
, int16_t c
); // VMLA.I16 q0, q0, d0[0]
1897 int32x4_t
vmlaq_n_s32(int32x4_t a
, int32x4_t b
, int32_t c
); // VMLA.I32 q0, q0, d0[0]
1898 uint16x8_t
vmlaq_n_u16(uint16x8_t a
, uint16x8_t b
, uint16_t c
); // VMLA.I16 q0, q0, d0[0]
1899 uint32x4_t
vmlaq_n_u32(uint32x4_t a
, uint32x4_t b
, uint32_t c
); // VMLA.I32 q0, q0, d0[0]
1900 float32x4_t
vmlaq_n_f32(float32x4_t a
, float32x4_t b
, float32_t c
); // VMLA.F32 q0, q0, d0[0]
1901 //Vector widening multiply accumulate with scalar
1902 int32x4_t
vmlal_n_s16(int32x4_t a
, int16x4_t b
, int16_t c
); // VMLAL.S16 q0, d0, d0[0]
1903 int64x2_t
vmlal_n_s32(int64x2_t a
, int32x2_t b
, int32_t c
); // VMLAL.S32 q0, d0, d0[0]
1904 uint32x4_t
vmlal_n_u16(uint32x4_t a
, uint16x4_t b
, uint16_t c
); // VMLAL.U16 q0, d0, d0[0]
1905 uint64x2_t
vmlal_n_u32(uint64x2_t a
, uint32x2_t b
, uint32_t c
); // VMLAL.U32 q0, d0, d0[0]
1906 //Vector widening saturating doubling multiply accumulate with scalar
1907 int32x4_t
vqdmlal_n_s16(int32x4_t a
, int16x4_t b
, int16_t c
); // VQDMLAL.S16 q0, d0, d0[0]
1908 int64x2_t
vqdmlal_n_s32(int64x2_t a
, int32x2_t b
, int32_t c
); // VQDMLAL.S32 q0, d0, d0[0]
1909 //Vector multiply subtract with scalar
1910 int16x4_t
vmls_n_s16(int16x4_t a
, int16x4_t b
, int16_t c
); // VMLS.I16 d0, d0, d0[0]
1911 int32x2_t
vmls_n_s32(int32x2_t a
, int32x2_t b
, int32_t c
); // VMLS.I32 d0, d0, d0[0]
1912 uint16x4_t
vmls_n_u16(uint16x4_t a
, uint16x4_t b
, uint16_t c
); // VMLS.I16 d0, d0, d0[0]
1913 uint32x2_t
vmls_n_u32(uint32x2_t a
, uint32x2_t b
, uint32_t c
); // VMLS.I32 d0, d0, d0[0]
1914 float32x2_t
vmls_n_f32(float32x2_t a
, float32x2_t b
, float32_t c
); // VMLS.F32 d0, d0, d0[0]
1915 int16x8_t
vmlsq_n_s16(int16x8_t a
, int16x8_t b
, int16_t c
); // VMLS.I16 q0, q0, d0[0]
1916 int32x4_t
vmlsq_n_s32(int32x4_t a
, int32x4_t b
, int32_t c
); // VMLS.I32 q0, q0, d0[0]
1917 uint16x8_t
vmlsq_n_u16(uint16x8_t a
, uint16x8_t b
, uint16_t c
); // VMLS.I16 q0, q0, d0[0]
1918 uint32x4_t
vmlsq_n_u32(uint32x4_t a
, uint32x4_t b
, uint32_t c
); // VMLS.I32 q0, q0, d0[0]
1919 float32x4_t
vmlsq_n_f32(float32x4_t a
, float32x4_t b
, float32_t c
); // VMLS.F32 q0, q0, d0[0]
1920 //Vector widening multiply subtract with scalar
1921 int32x4_t
vmlsl_n_s16(int32x4_t a
, int16x4_t b
, int16_t c
); // VMLSL.S16 q0, d0, d0[0]
1922 int64x2_t
vmlsl_n_s32(int64x2_t a
, int32x2_t b
, int32_t c
); // VMLSL.S32 q0, d0, d0[0]
1923 uint32x4_t
vmlsl_n_u16(uint32x4_t a
, uint16x4_t b
, uint16_t c
); // VMLSL.U16 q0, d0, d0[0]
1924 uint64x2_t
vmlsl_n_u32(uint64x2_t a
, uint32x2_t b
, uint32_t c
); // VMLSL.U32 q0, d0, d0[0]
1925 //Vector widening saturating doubling multiply subtract with scalar
1926 int32x4_t
vqdmlsl_n_s16(int32x4_t a
, int16x4_t b
, int16_t c
); // VQDMLSL.S16 q0, d0, d0[0]
1927 int64x2_t
vqdmlsl_n_s32(int64x2_t a
, int32x2_t b
, int32_t c
); // VQDMLSL.S32 q0, d0, d0[0]
int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
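// Illustrative sketch only (assumption): the NEON VEXT "sliding window over two vectors" is what the
// SSSE3 _mm_alignr_epi8 instruction provides, with the operand order swapped (the first VEXT operand
// supplies the low bytes). The helper name and the fixed byte offset below are hypothetical.
static __m128i example_extq_u8_3(__m128i a, __m128i b)
{
    return _mm_alignr_epi8(b, a, 3); // bytes 3..15 of a followed by bytes 0..2 of b
}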
1951 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
1952 int8x8_t
vrev64_s8(int8x8_t vec
); // VREV64.8 d0,d0
1953 int16x4_t
vrev64_s16(int16x4_t vec
); // VREV64.16 d0,d0
1954 int32x2_t
vrev64_s32(int32x2_t vec
); // VREV64.32 d0,d0
1955 uint8x8_t
vrev64_u8(uint8x8_t vec
); // VREV64.8 d0,d0
1956 uint16x4_t
vrev64_u16(uint16x4_t vec
); // VREV64.16 d0,d0
1957 uint32x2_t
vrev64_u32(uint32x2_t vec
); // VREV64.32 d0,d0
1958 poly8x8_t
vrev64_p8(poly8x8_t vec
); // VREV64.8 d0,d0
1959 poly16x4_t
vrev64_p16(poly16x4_t vec
); // VREV64.16 d0,d0
1960 float32x2_t
vrev64_f32(float32x2_t vec
); // VREV64.32 d0,d0
1961 int8x16_t
vrev64q_s8(int8x16_t vec
); // VREV64.8 q0,q0
1962 int16x8_t
vrev64q_s16(int16x8_t vec
); // VREV64.16 q0,q0
1963 int32x4_t
vrev64q_s32(int32x4_t vec
); // VREV64.32 q0,q0
1964 uint8x16_t
vrev64q_u8(uint8x16_t vec
); // VREV64.8 q0,q0
1965 uint16x8_t
vrev64q_u16(uint16x8_t vec
); // VREV64.16 q0,q0
1966 uint32x4_t
vrev64q_u32(uint32x4_t vec
); // VREV64.32 q0,q0
1967 poly8x16_t
vrev64q_p8(poly8x16_t vec
); // VREV64.8 q0,q0
1968 poly16x8_t
vrev64q_p16(poly16x8_t vec
); // VREV64.16 q0,q0
1969 float32x4_t
vrev64q_f32(float32x4_t vec
); // VREV64.32 q0,q0
1970 int8x8_t
vrev32_s8(int8x8_t vec
); // VREV32.8 d0,d0
1971 int16x4_t
vrev32_s16(int16x4_t vec
); // VREV32.16 d0,d0
1972 uint8x8_t
vrev32_u8(uint8x8_t vec
); // VREV32.8 d0,d0
1973 uint16x4_t
vrev32_u16(uint16x4_t vec
); // VREV32.16 d0,d0
1974 poly8x8_t
vrev32_p8(poly8x8_t vec
); // VREV32.8 d0,d0
1975 poly16x4_t
vrev32_p16(poly16x4_t vec
); // VREV32.16 d0,d0
1976 int8x16_t
vrev32q_s8(int8x16_t vec
); // VREV32.8 q0,q0
1977 int16x8_t
vrev32q_s16(int16x8_t vec
); // VREV32.16 q0,q0
1978 uint8x16_t
vrev32q_u8(uint8x16_t vec
); // VREV32.8 q0,q0
1979 uint16x8_t
vrev32q_u16(uint16x8_t vec
); // VREV32.16 q0,q0
1980 poly8x16_t
vrev32q_p8(poly8x16_t vec
); // VREV32.8 q0,q0
1981 poly16x8_t
vrev32q_p16(poly16x8_t vec
); // VREV32.16 q0,q0
1982 int8x8_t
vrev16_s8(int8x8_t vec
); // VREV16.8 d0,d0
1983 uint8x8_t
vrev16_u8(uint8x8_t vec
); // VREV16.8 d0,d0
1984 poly8x8_t
vrev16_p8(poly8x8_t vec
); // VREV16.8 d0,d0
1985 int8x16_t
vrev16q_s8(int8x16_t vec
); // VREV16.8 q0,q0
1986 uint8x16_t
vrev16q_u8(uint8x16_t vec
); // VREV16.8 q0,q0
1987 poly8x16_t
vrev16q_p8(poly8x16_t vec
); // VREV16.8 q0,q0
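// Illustrative sketch only (assumption): reversing the bytes inside each 16-bit lane (vrev16 style)
// needs no shuffle unit at all with SSE2; two 16-bit shifts and an OR are enough. The wider
// reversals (VREV32/VREV64) are more naturally expressed with the SSSE3 byte shuffle. The helper
// name is hypothetical.
static __m128i example_rev16q_u8(__m128i a)
{
    return _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8)); // swap the two bytes of every 16-bit lane
}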
1988 //Other single operand arithmetic
1989 //Absolute: Vd[i] = |Va[i]|
1990 int8x8_t
vabs_s8(int8x8_t a
); // VABS.S8 d0,d0
1991 int16x4_t
vabs_s16(int16x4_t a
); // VABS.S16 d0,d0
1992 int32x2_t
vabs_s32(int32x2_t a
); // VABS.S32 d0,d0
1993 float32x2_t
vabs_f32(float32x2_t a
); // VABS.F32 d0,d0
1994 int8x16_t
vabsq_s8(int8x16_t a
); // VABS.S8 q0,q0
1995 int16x8_t
vabsq_s16(int16x8_t a
); // VABS.S16 q0,q0
1996 int32x4_t
vabsq_s32(int32x4_t a
); // VABS.S32 q0,q0
1997 float32x4_t
vabsq_f32(float32x4_t a
); // VABS.F32 q0,q0
1998 //Saturating absolute: Vd[i] = sat(|Va[i]|)
1999 int8x8_t
vqabs_s8(int8x8_t a
); // VQABS.S8 d0,d0
2000 int16x4_t
vqabs_s16(int16x4_t a
); // VQABS.S16 d0,d0
2001 int32x2_t
vqabs_s32(int32x2_t a
); // VQABS.S32 d0,d0
2002 int8x16_t
vqabsq_s8(int8x16_t a
); // VQABS.S8 q0,q0
2003 int16x8_t
vqabsq_s16(int16x8_t a
); // VQABS.S16 q0,q0
2004 int32x4_t
vqabsq_s32(int32x4_t a
); // VQABS.S32 q0,q0
//Negate: Vd[i] = - Va[i]
int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
//Saturating Negate: Vd[i] = sat(- Va[i])
int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
2021 //Count leading sign bits
2022 int8x8_t
vcls_s8(int8x8_t a
); // VCLS.S8 d0,d0
2023 int16x4_t
vcls_s16(int16x4_t a
); // VCLS.S16 d0,d0
2024 int32x2_t
vcls_s32(int32x2_t a
); // VCLS.S32 d0,d0
2025 int8x16_t
vclsq_s8(int8x16_t a
); // VCLS.S8 q0,q0
2026 int16x8_t
vclsq_s16(int16x8_t a
); // VCLS.S16 q0,q0
2027 int32x4_t
vclsq_s32(int32x4_t a
); // VCLS.S32 q0,q0
2028 //Count leading zeros
2029 int8x8_t
vclz_s8(int8x8_t a
); // VCLZ.I8 d0,d0
2030 int16x4_t
vclz_s16(int16x4_t a
); // VCLZ.I16 d0,d0
2031 int32x2_t
vclz_s32(int32x2_t a
); // VCLZ.I32 d0,d0
2032 uint8x8_t
vclz_u8(uint8x8_t a
); // VCLZ.I8 d0,d0
2033 uint16x4_t
vclz_u16(uint16x4_t a
); // VCLZ.I16 d0,d0
2034 uint32x2_t
vclz_u32(uint32x2_t a
); // VCLZ.I32 d0,d0
2035 int8x16_t
vclzq_s8(int8x16_t a
); // VCLZ.I8 q0,q0
2036 int16x8_t
vclzq_s16(int16x8_t a
); // VCLZ.I16 q0,q0
2037 int32x4_t
vclzq_s32(int32x4_t a
); // VCLZ.I32 q0,q0
2038 uint8x16_t
vclzq_u8(uint8x16_t a
); // VCLZ.I8 q0,q0
2039 uint16x8_t
vclzq_u16(uint16x8_t a
); // VCLZ.I16 q0,q0
2040 uint32x4_t
vclzq_u32(uint32x4_t a
); // VCLZ.I32 q0,q0
2041 //Count number of set bits
2042 uint8x8_t
vcnt_u8(uint8x8_t a
); // VCNT.8 d0,d0
2043 int8x8_t
vcnt_s8(int8x8_t a
); // VCNT.8 d0,d0
2044 poly8x8_t
vcnt_p8(poly8x8_t a
); // VCNT.8 d0,d0
2045 uint8x16_t
vcntq_u8(uint8x16_t a
); // VCNT.8 q0,q0
2046 int8x16_t
vcntq_s8(int8x16_t a
); // VCNT.8 q0,q0
2047 poly8x16_t
vcntq_p8(poly8x16_t a
); // VCNT.8 q0,q0
//Reciprocal estimate
float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
//Reciprocal square root estimate
float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
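// Illustrative sketch only (assumption): SSE has direct counterparts for the floating-point estimate
// instructions: _mm_rcp_ps for the reciprocal and _mm_rsqrt_ps for the reciprocal square root. Their
// precision (about 12 bits) differs from the NEON estimates, so bit patterns will not match exactly;
// a Newton-Raphson step is the usual refinement. Helper names are hypothetical.
static __m128 example_recpeq_f32(__m128 a)
{
    return _mm_rcp_ps(a); // ~12-bit reciprocal estimate, per lane
}
static __m128 example_rsqrteq_f32(__m128 a)
{
    return _mm_rsqrt_ps(a); // ~12-bit reciprocal square root estimate, per lane
}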
2058 //Logical operations
2060 int8x8_t
vmvn_s8(int8x8_t a
); // VMVN d0,d0
2061 int16x4_t
vmvn_s16(int16x4_t a
); // VMVN d0,d0
2062 int32x2_t
vmvn_s32(int32x2_t a
); // VMVN d0,d0
2063 uint8x8_t
vmvn_u8(uint8x8_t a
); // VMVN d0,d0
2064 uint16x4_t
vmvn_u16(uint16x4_t a
); // VMVN d0,d0
2065 uint32x2_t
vmvn_u32(uint32x2_t a
); // VMVN d0,d0
2066 poly8x8_t
vmvn_p8(poly8x8_t a
); // VMVN d0,d0
2067 int8x16_t
vmvnq_s8(int8x16_t a
); // VMVN q0,q0
2068 int16x8_t
vmvnq_s16(int16x8_t a
); // VMVN q0,q0
2069 int32x4_t
vmvnq_s32(int32x4_t a
); // VMVN q0,q0
2070 uint8x16_t
vmvnq_u8(uint8x16_t a
); // VMVN q0,q0
2071 uint16x8_t
vmvnq_u16(uint16x8_t a
); // VMVN q0,q0
2072 uint32x4_t
vmvnq_u32(uint32x4_t a
); // VMVN q0,q0
2073 poly8x16_t
vmvnq_p8(poly8x16_t a
); // VMVN q0,q0
2075 int8x8_t
vand_s8(int8x8_t a
, int8x8_t b
); // VAND d0,d0,d0
2076 int16x4_t
vand_s16(int16x4_t a
, int16x4_t b
); // VAND d0,d0,d0
2077 int32x2_t
vand_s32(int32x2_t a
, int32x2_t b
); // VAND d0,d0,d0
2078 int64x1_t
vand_s64(int64x1_t a
, int64x1_t b
); // VAND d0,d0,d0
2079 uint8x8_t
vand_u8(uint8x8_t a
, uint8x8_t b
); // VAND d0,d0,d0
2080 uint16x4_t
vand_u16(uint16x4_t a
, uint16x4_t b
); // VAND d0,d0,d0
2081 uint32x2_t
vand_u32(uint32x2_t a
, uint32x2_t b
); // VAND d0,d0,d0
2082 uint64x1_t
vand_u64(uint64x1_t a
, uint64x1_t b
); // VAND d0,d0,d0
2083 int8x16_t
vandq_s8(int8x16_t a
, int8x16_t b
); // VAND q0,q0,q0
2084 int16x8_t
vandq_s16(int16x8_t a
, int16x8_t b
); // VAND q0,q0,q0
2085 int32x4_t
vandq_s32(int32x4_t a
, int32x4_t b
); // VAND q0,q0,q0
2086 int64x2_t
vandq_s64(int64x2_t a
, int64x2_t b
); // VAND q0,q0,q0
2087 uint8x16_t
vandq_u8(uint8x16_t a
, uint8x16_t b
); // VAND q0,q0,q0
2088 uint16x8_t
vandq_u16(uint16x8_t a
, uint16x8_t b
); // VAND q0,q0,q0
2089 uint32x4_t
vandq_u32(uint32x4_t a
, uint32x4_t b
); // VAND q0,q0,q0
2090 uint64x2_t
vandq_u64(uint64x2_t a
, uint64x2_t b
); // VAND q0,q0,q0
2092 int8x8_t
vorr_s8(int8x8_t a
, int8x8_t b
); // VORR d0,d0,d0
2093 int16x4_t
vorr_s16(int16x4_t a
, int16x4_t b
); // VORR d0,d0,d0
2094 int32x2_t
vorr_s32(int32x2_t a
, int32x2_t b
); // VORR d0,d0,d0
2095 int64x1_t
vorr_s64(int64x1_t a
, int64x1_t b
); // VORR d0,d0,d0
2096 uint8x8_t
vorr_u8(uint8x8_t a
, uint8x8_t b
); // VORR d0,d0,d0
2097 uint16x4_t
vorr_u16(uint16x4_t a
, uint16x4_t b
); // VORR d0,d0,d0
2098 uint32x2_t
vorr_u32(uint32x2_t a
, uint32x2_t b
); // VORR d0,d0,d0
2099 uint64x1_t
vorr_u64(uint64x1_t a
, uint64x1_t b
); // VORR d0,d0,d0
2100 int8x16_t
vorrq_s8(int8x16_t a
, int8x16_t b
); // VORR q0,q0,q0
2101 int16x8_t
vorrq_s16(int16x8_t a
, int16x8_t b
); // VORR q0,q0,q0
2102 int32x4_t
vorrq_s32(int32x4_t a
, int32x4_t b
); // VORR q0,q0,q0
2103 int64x2_t
vorrq_s64(int64x2_t a
, int64x2_t b
); // VORR q0,q0,q0
2104 uint8x16_t
vorrq_u8(uint8x16_t a
, uint8x16_t b
); // VORR q0,q0,q0
2105 uint16x8_t
vorrq_u16(uint16x8_t a
, uint16x8_t b
); // VORR q0,q0,q0
2106 uint32x4_t
vorrq_u32(uint32x4_t a
, uint32x4_t b
); // VORR q0,q0,q0
2107 uint64x2_t
vorrq_u64(uint64x2_t a
, uint64x2_t b
); // VORR q0,q0,q0
2108 //Bitwise exclusive or (EOR or XOR)
2109 int8x8_t
veor_s8(int8x8_t a
, int8x8_t b
); // VEOR d0,d0,d0
2110 int16x4_t
veor_s16(int16x4_t a
, int16x4_t b
); // VEOR d0,d0,d0
2111 int32x2_t
veor_s32(int32x2_t a
, int32x2_t b
); // VEOR d0,d0,d0
2112 int64x1_t
veor_s64(int64x1_t a
, int64x1_t b
); // VEOR d0,d0,d0
2113 uint8x8_t
veor_u8(uint8x8_t a
, uint8x8_t b
); // VEOR d0,d0,d0
2114 uint16x4_t
veor_u16(uint16x4_t a
, uint16x4_t b
); // VEOR d0,d0,d0
2115 uint32x2_t
veor_u32(uint32x2_t a
, uint32x2_t b
); // VEOR d0,d0,d0
2116 uint64x1_t
veor_u64(uint64x1_t a
, uint64x1_t b
); // VEOR d0,d0,d0
2117 int8x16_t
veorq_s8(int8x16_t a
, int8x16_t b
); // VEOR q0,q0,q0
2118 int16x8_t
veorq_s16(int16x8_t a
, int16x8_t b
); // VEOR q0,q0,q0
2119 int32x4_t
veorq_s32(int32x4_t a
, int32x4_t b
); // VEOR q0,q0,q0
2120 int64x2_t
veorq_s64(int64x2_t a
, int64x2_t b
); // VEOR q0,q0,q0
2121 uint8x16_t
veorq_u8(uint8x16_t a
, uint8x16_t b
); // VEOR q0,q0,q0
2122 uint16x8_t
veorq_u16(uint16x8_t a
, uint16x8_t b
); // VEOR q0,q0,q0
2123 uint32x4_t
veorq_u32(uint32x4_t a
, uint32x4_t b
); // VEOR q0,q0,q0
2124 uint64x2_t
veorq_u64(uint64x2_t a
, uint64x2_t b
); // VEOR q0,q0,q0
2126 int8x8_t
vbic_s8(int8x8_t a
, int8x8_t b
); // VBIC d0,d0,d0
2127 int16x4_t
vbic_s16(int16x4_t a
, int16x4_t b
); // VBIC d0,d0,d0
2128 int32x2_t
vbic_s32(int32x2_t a
, int32x2_t b
); // VBIC d0,d0,d0
2129 int64x1_t
vbic_s64(int64x1_t a
, int64x1_t b
); // VBIC d0,d0,d0
2130 uint8x8_t
vbic_u8(uint8x8_t a
, uint8x8_t b
); // VBIC d0,d0,d0
2131 uint16x4_t
vbic_u16(uint16x4_t a
, uint16x4_t b
); // VBIC d0,d0,d0
2132 uint32x2_t
vbic_u32(uint32x2_t a
, uint32x2_t b
); // VBIC d0,d0,d0
2133 uint64x1_t
vbic_u64(uint64x1_t a
, uint64x1_t b
); // VBIC d0,d0,d0
2134 int8x16_t
vbicq_s8(int8x16_t a
, int8x16_t b
); // VBIC q0,q0,q0
2135 int16x8_t
vbicq_s16(int16x8_t a
, int16x8_t b
); // VBIC q0,q0,q0
2136 int32x4_t
vbicq_s32(int32x4_t a
, int32x4_t b
); // VBIC q0,q0,q0
2137 int64x2_t
vbicq_s64(int64x2_t a
, int64x2_t b
); // VBIC q0,q0,q0
2138 uint8x16_t
vbicq_u8(uint8x16_t a
, uint8x16_t b
); // VBIC q0,q0,q0
2139 uint16x8_t
vbicq_u16(uint16x8_t a
, uint16x8_t b
); // VBIC q0,q0,q0
2140 uint32x4_t
vbicq_u32(uint32x4_t a
, uint32x4_t b
); // VBIC q0,q0,q0
2141 uint64x2_t
vbicq_u64(uint64x2_t a
, uint64x2_t b
); // VBIC q0,q0,q0
2142 //Bitwise OR complement
2143 int8x8_t
vorn_s8(int8x8_t a
, int8x8_t b
); // VORN d0,d0,d0
2144 int16x4_t
vorn_s16(int16x4_t a
, int16x4_t b
); // VORN d0,d0,d0
2145 int32x2_t
vorn_s32(int32x2_t a
, int32x2_t b
); // VORN d0,d0,d0
2146 int64x1_t
vorn_s64(int64x1_t a
, int64x1_t b
); // VORN d0,d0,d0
2147 uint8x8_t
vorn_u8(uint8x8_t a
, uint8x8_t b
); // VORN d0,d0,d0
2148 uint16x4_t
vorn_u16(uint16x4_t a
, uint16x4_t b
); // VORN d0,d0,d0
2149 uint32x2_t
vorn_u32(uint32x2_t a
, uint32x2_t b
); // VORN d0,d0,d0
2150 uint64x1_t
vorn_u64(uint64x1_t a
, uint64x1_t b
); // VORN d0,d0,d0
2151 int8x16_t
vornq_s8(int8x16_t a
, int8x16_t b
); // VORN q0,q0,q0
2152 int16x8_t
vornq_s16(int16x8_t a
, int16x8_t b
); // VORN q0,q0,q0
2153 int32x4_t
vornq_s32(int32x4_t a
, int32x4_t b
); // VORN q0,q0,q0
2154 int64x2_t
vornq_s64(int64x2_t a
, int64x2_t b
); // VORN q0,q0,q0
2155 uint8x16_t
vornq_u8(uint8x16_t a
, uint8x16_t b
); // VORN q0,q0,q0
2156 uint16x8_t
vornq_u16(uint16x8_t a
, uint16x8_t b
); // VORN q0,q0,q0
2157 uint32x4_t
vornq_u32(uint32x4_t a
, uint32x4_t b
); // VORN q0,q0,q0
2158 uint64x2_t
vornq_u64(uint64x2_t a
, uint64x2_t b
); // VORN q0,q0,q0
2160 int8x8_t
vbsl_s8(uint8x8_t a
, int8x8_t b
, int8x8_t c
); // VBSL d0,d0,d0
2161 int16x4_t
vbsl_s16(uint16x4_t a
, int16x4_t b
, int16x4_t c
); // VBSL d0,d0,d0
2162 int32x2_t
vbsl_s32(uint32x2_t a
, int32x2_t b
, int32x2_t c
); // VBSL d0,d0,d0
2163 int64x1_t
vbsl_s64(uint64x1_t a
, int64x1_t b
, int64x1_t c
); // VBSL d0,d0,d0
2164 uint8x8_t
vbsl_u8(uint8x8_t a
, uint8x8_t b
, uint8x8_t c
); // VBSL d0,d0,d0
2165 uint16x4_t
vbsl_u16(uint16x4_t a
, uint16x4_t b
, uint16x4_t c
); // VBSL d0,d0,d0
2166 uint32x2_t
vbsl_u32(uint32x2_t a
, uint32x2_t b
, uint32x2_t c
); // VBSL d0,d0,d0
2167 uint64x1_t
vbsl_u64(uint64x1_t a
, uint64x1_t b
, uint64x1_t c
); // VBSL d0,d0,d0
2168 float32x2_t
vbsl_f32(uint32x2_t a
, float32x2_t b
, float32x2_t c
); // VBSL d0,d0,d0
2169 poly8x8_t
vbsl_p8(uint8x8_t a
, poly8x8_t b
, poly8x8_t c
); // VBSL d0,d0,d0
2170 poly16x4_t
vbsl_p16(uint16x4_t a
, poly16x4_t b
, poly16x4_t c
); // VBSL d0,d0,d0
2171 int8x16_t
vbslq_s8(uint8x16_t a
, int8x16_t b
, int8x16_t c
); // VBSL q0,q0,q0
2172 int16x8_t
vbslq_s16(uint16x8_t a
, int16x8_t b
, int16x8_t c
); // VBSL q0,q0,q0
2173 int32x4_t
vbslq_s32(uint32x4_t a
, int32x4_t b
, int32x4_t c
); // VBSL q0,q0,q0
2174 int64x2_t
vbslq_s64(uint64x2_t a
, int64x2_t b
, int64x2_t c
); // VBSL q0,q0,q0
2175 uint8x16_t
vbslq_u8(uint8x16_t a
, uint8x16_t b
, uint8x16_t c
); // VBSL q0,q0,q0
2176 uint16x8_t
vbslq_u16(uint16x8_t a
, uint16x8_t b
, uint16x8_t c
); // VBSL q0,q0,q0
2177 uint32x4_t
vbslq_u32(uint32x4_t a
, uint32x4_t b
, uint32x4_t c
); // VBSL q0,q0,q0
2178 uint64x2_t
vbslq_u64(uint64x2_t a
, uint64x2_t b
, uint64x2_t c
); // VBSL q0,q0,q0
2179 float32x4_t
vbslq_f32(uint32x4_t a
, float32x4_t b
, float32x4_t c
); // VBSL q0,q0,q0
2180 poly8x16_t
vbslq_p8(uint8x16_t a
, poly8x16_t b
, poly8x16_t c
); // VBSL q0,q0,q0
2181 poly16x8_t
vbslq_p16(uint16x8_t a
, poly16x8_t b
, poly16x8_t c
); // VBSL q0,q0,q0
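// Illustrative sketch only (assumption): the NEON bitwise select VBSL is pure boolean logic,
// dst = (mask & b) | (~mask & c), which maps onto three SSE2 logic ops; _mm_andnot_si128
// conveniently computes (~mask) & c in one instruction. The helper name is hypothetical.
static __m128i example_bslq_u8(__m128i mask, __m128i b, __m128i c)
{
    return _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, c));
}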
2182 //Transposition operations
2183 //Transpose elements
2184 int8x8x2_t
vtrn_s8(int8x8_t a
, int8x8_t b
); // VTRN.8 d0,d0
2185 int16x4x2_t
vtrn_s16(int16x4_t a
, int16x4_t b
); // VTRN.16 d0,d0
2186 int32x2x2_t
vtrn_s32(int32x2_t a
, int32x2_t b
); // VTRN.32 d0,d0
2187 uint8x8x2_t
vtrn_u8(uint8x8_t a
, uint8x8_t b
); // VTRN.8 d0,d0
2188 uint16x4x2_t
vtrn_u16(uint16x4_t a
, uint16x4_t b
); // VTRN.16 d0,d0
2189 uint32x2x2_t
vtrn_u32(uint32x2_t a
, uint32x2_t b
); // VTRN.32 d0,d0
2190 float32x2x2_t
vtrn_f32(float32x2_t a
, float32x2_t b
); // VTRN.32 d0,d0
2191 poly8x8x2_t
vtrn_p8(poly8x8_t a
, poly8x8_t b
); // VTRN.8 d0,d0
2192 poly16x4x2_t
vtrn_p16(poly16x4_t a
, poly16x4_t b
); // VTRN.16 d0,d0
2193 int8x16x2_t
vtrnq_s8(int8x16_t a
, int8x16_t b
); // VTRN.8 q0,q0
2194 int16x8x2_t
vtrnq_s16(int16x8_t a
, int16x8_t b
); // VTRN.16 q0,q0
2195 int32x4x2_t
vtrnq_s32(int32x4_t a
, int32x4_t b
); // VTRN.32 q0,q0
2196 uint8x16x2_t
vtrnq_u8(uint8x16_t a
, uint8x16_t b
); // VTRN.8 q0,q0
2197 uint16x8x2_t
vtrnq_u16(uint16x8_t a
, uint16x8_t b
); // VTRN.16 q0,q0
2198 uint32x4x2_t
vtrnq_u32(uint32x4_t a
, uint32x4_t b
); // VTRN.32 q0,q0
2199 float32x4x2_t
vtrnq_f32(float32x4_t a
, float32x4_t b
); // VTRN.32 q0,q0
2200 poly8x16x2_t
vtrnq_p8(poly8x16_t a
, poly8x16_t b
); // VTRN.8 q0,q0
2201 poly16x8x2_t
vtrnq_p16(poly16x8_t a
, poly16x8_t b
); // VTRN.16 q0,q0
2202 //Interleave elements
2203 int8x8x2_t
vzip_s8(int8x8_t a
, int8x8_t b
); // VZIP.8 d0,d0
2204 int16x4x2_t
vzip_s16(int16x4_t a
, int16x4_t b
); // VZIP.16 d0,d0
2205 int32x2x2_t
vzip_s32(int32x2_t a
, int32x2_t b
); // VZIP.32 d0,d0
2206 uint8x8x2_t
vzip_u8(uint8x8_t a
, uint8x8_t b
); // VZIP.8 d0,d0
2207 uint16x4x2_t
vzip_u16(uint16x4_t a
, uint16x4_t b
); // VZIP.16 d0,d0
2208 uint32x2x2_t
vzip_u32(uint32x2_t a
, uint32x2_t b
); // VZIP.32 d0,d0
2209 float32x2x2_t
vzip_f32(float32x2_t a
, float32x2_t b
); // VZIP.32 d0,d0
2210 poly8x8x2_t
vzip_p8(poly8x8_t a
, poly8x8_t b
); // VZIP.8 d0,d0
2211 poly16x4x2_t
vzip_p16(poly16x4_t a
, poly16x4_t b
); // VZIP.16 d0,d0
2212 int8x16x2_t
vzipq_s8(int8x16_t a
, int8x16_t b
); // VZIP.8 q0,q0
2213 int16x8x2_t
vzipq_s16(int16x8_t a
, int16x8_t b
); // VZIP.16 q0,q0
2214 int32x4x2_t
vzipq_s32(int32x4_t a
, int32x4_t b
); // VZIP.32 q0,q0
2215 uint8x16x2_t
vzipq_u8(uint8x16_t a
, uint8x16_t b
); // VZIP.8 q0,q0
2216 uint16x8x2_t
vzipq_u16(uint16x8_t a
, uint16x8_t b
); // VZIP.16 q0,q0
2217 uint32x4x2_t
vzipq_u32(uint32x4_t a
, uint32x4_t b
); // VZIP.32 q0,q0
2218 float32x4x2_t
vzipq_f32(float32x4_t a
, float32x4_t b
); // VZIP.32 q0,q0
2219 poly8x16x2_t
vzipq_p8(poly8x16_t a
, poly8x16_t b
); // VZIP.8 q0,q0
2220 poly16x8x2_t
vzipq_p16(poly16x8_t a
, poly16x8_t b
); // VZIP.16 q0,q0
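// Illustrative sketch only (assumption): interleaving two vectors (vzipq style) is exactly the pair
// of SSE2 unpack instructions; the low unpack produces the first result vector and the high unpack
// the second. The struct and helper names below are hypothetical.
typedef struct { __m128i val[2]; } example_u8x16x2;
static example_u8x16x2 example_zipq_u8(__m128i a, __m128i b)
{
    example_u8x16x2 r;
    r.val[0] = _mm_unpacklo_epi8(a, b); // a0,b0,a1,b1,...
    r.val[1] = _mm_unpackhi_epi8(a, b); // a8,b8,a9,b9,...
    return r;
}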
2221 //De-Interleave elements
2222 int8x8x2_t
vuzp_s8(int8x8_t a
, int8x8_t b
); // VUZP.8 d0,d0
2223 int16x4x2_t
vuzp_s16(int16x4_t a
, int16x4_t b
); // VUZP.16 d0,d0
2224 int32x2x2_t
vuzp_s32(int32x2_t a
, int32x2_t b
); // VUZP.32 d0,d0
2225 uint8x8x2_t
vuzp_u8(uint8x8_t a
, uint8x8_t b
); // VUZP.8 d0,d0
2226 uint16x4x2_t
vuzp_u16(uint16x4_t a
, uint16x4_t b
); // VUZP.16 d0,d0
2227 uint32x2x2_t
vuzp_u32(uint32x2_t a
, uint32x2_t b
); // VUZP.32 d0,d0
2228 float32x2x2_t
vuzp_f32(float32x2_t a
, float32x2_t b
); // VUZP.32 d0,d0
2229 poly8x8x2_t
vuzp_p8(poly8x8_t a
, poly8x8_t b
); // VUZP.8 d0,d0
2230 poly16x4x2_t
vuzp_p16(poly16x4_t a
, poly16x4_t b
); // VUZP.16 d0,d0
2231 int8x16x2_t
vuzpq_s8(int8x16_t a
, int8x16_t b
); // VUZP.8 q0,q0
2232 int16x8x2_t
vuzpq_s16(int16x8_t a
, int16x8_t b
); // VUZP.16 q0,q0
2233 int32x4x2_t
vuzpq_s32(int32x4_t a
, int32x4_t b
); // VUZP.32 q0,q0
2234 uint8x16x2_t
vuzpq_u8(uint8x16_t a
, uint8x16_t b
); // VUZP.8 q0,q0
2235 uint16x8x2_t
vuzpq_u16(uint16x8_t a
, uint16x8_t b
); // VUZP.16 q0,q0
2236 uint32x4x2_t
vuzpq_u32(uint32x4_t a
, uint32x4_t b
); // VUZP.32 q0,q0
2237 float32x4x2_t
vuzpq_f32(float32x4_t a
, float32x4_t b
); // VUZP.32 q0,q0
2238 poly8x16x2_t
vuzpq_p8(poly8x16_t a
, poly8x16_t b
); // VUZP.8 q0,q0
2239 poly16x8x2_t
vuzpq_p16(poly16x8_t a
, poly16x8_t b
); // VUZP.16 q0,q0
//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// The following macros solve the "immediate parameter required" problem of some x86 intrinsics.
// For a release build they are not a must, but for a debug build we need them, otherwise the code
// does not compile and we get the "Intrinsic parameter must be an immediate value" error.
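// For illustration only (a hedged sketch, not part of the mapping itself): with a run-time lane index
//     int lane = get_lane_somehow();          //hypothetical run-time value
//     int x = _mm_extract_epi16(vec, lane);   //rejected in a debug build: the lane must be an immediate
//     int y = _MM_EXTRACT_EPI16(vec, lane);   //OK: the wrapper below switches over all possible lane values,
//                                             //so every intrinsic call inside it gets a constant lane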
#if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) ) && defined NDEBUG //in a release build the intrinsics can be used directly; VS2010 and earlier still need the workaround below even in release

    #define _MM_ALIGNR_EPI8 _mm_alignr_epi8

    #define _MM_EXTRACT_EPI16 _mm_extract_epi16
    #define _MM_INSERT_EPI16 _mm_insert_epi16

    #define _MM_EXTRACT_EPI8 _mm_extract_epi8
    #define _MM_EXTRACT_EPI32 _mm_extract_epi32
    #define _MM_EXTRACT_PS _mm_extract_ps

    #define _MM_INSERT_EPI8 _mm_insert_epi8
    #define _MM_INSERT_EPI32 _mm_insert_epi32
    #define _MM_INSERT_PS _mm_insert_ps
    #ifdef _NEON2SSE_64BIT
        #define _MM_INSERT_EPI64 _mm_insert_epi64
        #define _MM_EXTRACT_EPI64 _mm_extract_epi64
    #endif
#else //use the workaround wrappers and macros below

    #define _NEON2SSE_COMMA ,

    #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
        switch(LANE) \
        { \
        case 0:  return NAME(a b, 0); \
        case 1:  return NAME(a b, 1); \
        case 2:  return NAME(a b, 2); \
        case 3:  return NAME(a b, 3); \
        case 4:  return NAME(a b, 4); \
        case 5:  return NAME(a b, 5); \
        case 6:  return NAME(a b, 6); \
        case 7:  return NAME(a b, 7); \
        case 8:  return NAME(a b, 8); \
        case 9:  return NAME(a b, 9); \
        case 10: return NAME(a b, 10); \
        case 11: return NAME(a b, 11); \
        case 12: return NAME(a b, 12); \
        case 13: return NAME(a b, 13); \
        case 14: return NAME(a b, 14); \
        case 15: return NAME(a b, 15); \
        default: return NAME(a b, 0); \
        }

    #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
        switch(LANE) \
        { \
        case 0: return NAME(vec p,0); \
        case 1: return NAME(vec p,1); \
        case 2: return NAME(vec p,2); \
        case 3: return NAME(vec p,3); \
        case 4: return NAME(vec p,4); \
        case 5: return NAME(vec p,5); \
        case 6: return NAME(vec p,6); \
        case 7: return NAME(vec p,7); \
        default: return NAME(vec p,0); \
        }

    #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
        switch(LANE) \
        { \
        case case0: return NAME(vec p,case0); \
        case case1: return NAME(vec p,case1); \
        case case2: return NAME(vec p,case2); \
        case case3: return NAME(vec p,case3); \
        default:    return NAME(vec p,case0); \
        }
    _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
    {
        _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
    }

    _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
    {
        _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
    }

    _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
    {
        _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
    }

    #ifdef USE_SSE4
        _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
        {
            _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
        }

        _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
        {
            _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
        }

        _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
        {
            _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
        }

        _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
        {
            _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
        }

        _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
        {
            _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
        }

        #ifdef _NEON2SSE_64BIT
            //the special case of functions available only for SSE4 and 64-bit build.
            _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
            {
                switch(LANE) {
                case 0:
                    return _mm_insert_epi64(vec, p, 0);
                case 1:
                    return _mm_insert_epi64(vec, p, 1);
                default:
                    return _mm_insert_epi64(vec, p, 0);
                }
            }

            _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
            {
                if (LANE == 0) return _mm_extract_epi64(val, 0);
                else return _mm_extract_epi64(val, 1);
            }
        #endif //_NEON2SSE_64BIT

        _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
        {
            _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
        }
    #endif //USE_SSE4

#endif //#ifdef NDEBUG
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Below are some helper functions used either for SSE4 intrinsics "emulation" on SSSE3-limited devices
// or for the implementation of some commonly used operations missing in SSE
#ifdef USE_SSE4
    #define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
    #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
    #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64

    #define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
    #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
    #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64

    #define _MM_MAX_EPI8  _mm_max_epi8
    #define _MM_MAX_EPI32 _mm_max_epi32
    #define _MM_MAX_EPU16 _mm_max_epu16
    #define _MM_MAX_EPU32 _mm_max_epu32

    #define _MM_MIN_EPI8  _mm_min_epi8
    #define _MM_MIN_EPI32 _mm_min_epi32
    #define _MM_MIN_EPU16 _mm_min_epu16
    #define _MM_MIN_EPU32 _mm_min_epu32

    #define _MM_BLENDV_EPI8 _mm_blendv_epi8
    #define _MM_PACKUS_EPI32 _mm_packus_epi32
    #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)

    #define _MM_MULLO_EPI32 _mm_mullo_epi32
    #define _MM_MUL_EPI32  _mm_mul_epi32

    #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
#else //no SSE4 !!!!!!
    _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
    {
        __m128i zero = _mm_setzero_si128();
        return _mm_unpacklo_epi8(a, zero);
    }

    _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
    {
        __m128i zero = _mm_setzero_si128();
        return _mm_unpacklo_epi16(a, zero);
    }

    _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
    {
        __m128i zero = _mm_setzero_si128();
        return _mm_unpacklo_epi32(a, zero);
    }

    _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
    {
        __m128i zero = _mm_setzero_si128();
        __m128i sign = _mm_cmpgt_epi8(zero, a);
        return _mm_unpacklo_epi8(a, sign);
    }

    _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
    {
        __m128i zero = _mm_setzero_si128();
        __m128i sign = _mm_cmpgt_epi16(zero, a);
        return _mm_unpacklo_epi16(a, sign);
    }

    _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
    {
        __m128i zero = _mm_setzero_si128();
        __m128i sign = _mm_cmpgt_epi32(zero, a);
        return _mm_unpacklo_epi32(a, sign);
    }

    _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
    {
        _NEON2SSE_ALIGN_16 int32_t tmp[4];
        _mm_store_si128((__m128i*)tmp, vec);
        return tmp[LANE];
    }
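    //Illustration of the emulation above (an informal sketch, not additional functionality):
    //zero extension interleaves the source bytes with zero bytes, e.g. for the low 64 bits of a:
    //    a = 0x80, 0x02, ...            ->  _MM_CVTEPU8_EPI16(a) = 0x0080, 0x0002, ...
    //sign extension interleaves the source with its sign mask produced by _mm_cmpgt_epi8(zero, a):
    //    a = 0x80 (-128), 0x02, ...     ->  _MM_CVTEPI8_EPI16(a) = 0xFF80 (-128), 0x0002, ...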
    _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
    {
        _NEON2SSE_ALIGN_16 int8_t tmp[16];
        _mm_store_si128((__m128i*)tmp, vec);
        return (int)tmp[LANE];
    }

    _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
    {
        _NEON2SSE_ALIGN_16 int32_t tmp[4];
        _mm_store_si128((__m128i*)tmp, _M128i(vec));
        return tmp[LANE];
    }

    _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
    {
        _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
        _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
        __m128i vec_masked, p_masked;
        pvec[LANE] = p;
        mask[LANE] = 0x0;
        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
        return _mm_or_si128(vec_masked, p_masked);
    }

    _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
    {
        _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
        _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
        __m128i vec_masked, p_masked;
        pvec[LANE] = (int8_t)p;
        mask[LANE] = 0x0;
        vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
        p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
        return _mm_or_si128(vec_masked, p_masked);
    }

    _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
    {
        _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
        __m128 tmp, vec_masked, p_masked;
        mask[LANE >> 4] = 0x0; //here LANE is not the actual lane number, it needs to be decoded
        vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
        p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
        tmp = _mm_or_ps(vec_masked, p_masked);
        return tmp;
    }
    _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
    {
        __m128i cmp, resa, resb;
        cmp = _mm_cmpgt_epi8 (a, b);
        resa = _mm_and_si128 (cmp, a);
        resb = _mm_andnot_si128 (cmp,b);
        return _mm_or_si128(resa, resb);
    }

    _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
    {
        __m128i cmp, resa, resb;
        cmp = _mm_cmpgt_epi32(a, b);
        resa = _mm_and_si128 (cmp, a);
        resb = _mm_andnot_si128 (cmp,b);
        return _mm_or_si128(resa, resb);
    }

    _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
    {
        __m128i c8000, b_s, a_s, cmp;
        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
        b_s = _mm_sub_epi16 (b, c8000);
        a_s = _mm_sub_epi16 (a, c8000);
        cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
        a_s = _mm_and_si128 (cmp,a);
        b_s = _mm_andnot_si128 (cmp,b);
        return _mm_or_si128(a_s, b_s);
    }

    _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
    {
        __m128i c80000000, b_s, a_s, cmp;
        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
        b_s = _mm_sub_epi32 (b, c80000000);
        a_s = _mm_sub_epi32 (a, c80000000);
        cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
        a_s = _mm_and_si128 (cmp,a);
        b_s = _mm_andnot_si128 (cmp,b);
        return _mm_or_si128(a_s, b_s);
    }

    _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
    {
        __m128i cmp, resa, resb;
        cmp = _mm_cmpgt_epi8 (b, a);
        resa = _mm_and_si128 (cmp, a);
        resb = _mm_andnot_si128 (cmp,b);
        return _mm_or_si128(resa, resb);
    }

    _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
    {
        __m128i cmp, resa, resb;
        cmp = _mm_cmpgt_epi32(b, a);
        resa = _mm_and_si128 (cmp, a);
        resb = _mm_andnot_si128 (cmp,b);
        return _mm_or_si128(resa, resb);
    }

    _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
    {
        __m128i c8000, b_s, a_s, cmp;
        c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
        c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
        b_s = _mm_sub_epi16 (b, c8000);
        a_s = _mm_sub_epi16 (a, c8000);
        cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
        a_s = _mm_and_si128 (cmp,a);
        b_s = _mm_andnot_si128 (cmp,b);
        return _mm_or_si128(a_s, b_s);
    }

    _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
    {
        __m128i c80000000, b_s, a_s, cmp;
        c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
        c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
        b_s = _mm_sub_epi32 (b, c80000000);
        a_s = _mm_sub_epi32 (a, c80000000);
        cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
        a_s = _mm_and_si128 (cmp,a);
        b_s = _mm_andnot_si128 (cmp,b);
        return _mm_or_si128(a_s, b_s);
    }
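    //Why the 0x8000 / 0x80000000 bias above works (an informal note): x86 has no unsigned epi16/epi32 compare,
    //but subtracting 0x8000 maps the unsigned range 0..65535 onto the signed range -32768..32767 while
    //preserving the ordering, so a signed greater-than on the biased values yields the unsigned comparison.
    //Example for _MM_MAX_EPU16: a = 0xFFFF (65535), b = 0x0001 -> a_s = 0x7FFF (32767), b_s = 0x8001 (-32767),
    //a_s > b_s, so the maximum is a, as expected for unsigned values.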
    _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 !!!!! - please see below
    {
        //it assumes the mask is either 0xff or 0 always (like in all use cases below), while for the original _mm_blendv_epi8 only the MSB of each mask byte matters.
        __m128i a_masked, b_masked;
        b_masked = _mm_and_si128 (mask,b); //use b if mask is 0xff
        a_masked = _mm_andnot_si128 (mask,a);
        return _mm_or_si128(a_masked, b_masked);
    }

    _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
    {
        _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
        __m128i a16, b16, res, reshi, cmp, zero;
        zero = _mm_setzero_si128();
        a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
        b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
        res = _mm_unpacklo_epi64(a16, b16); //result without saturation
        reshi = _mm_unpackhi_epi64(a16, b16); //hi part of the result used for saturation
        cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
        res = _mm_andnot_si128(cmp,res); //if cmp is zero - do nothing, otherwise cmp<0 and the result is 0
        cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp is positive
        return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits, need to saturate to 0xffff
    }

    _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
    {
        _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15};
        __m128i a16, res, reshi, cmp, zero;
        zero = _mm_setzero_si128();
        a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
        reshi = _mm_unpackhi_epi64(a16, a16); //hi part of the result used for saturation
        cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero
        res = _mm_andnot_si128(cmp, a16); //if cmp is zero - do nothing, otherwise cmp<0 and the result is 0
        cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp is positive
        return _mm_or_si128(res, cmp); //if cmp is positive we are out of 16 bits, need to saturate to 0xffff
    }
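    //Note on the _MM_PACKUS_EPI32 emulation above (an informal sketch): the pshufb mask gathers the low 16-bit
    //half of every 32-bit lane into the low 64 bits and the high halves into the upper 64 bits, so the high
    //halves alone decide the saturation: a source lane fits into 0..0xffff exactly when its high half is zero.
    //    source lane 0x00012345 -> high half 0x0001 (positive)  -> result lane forced to 0xffff
    //    source lane 0xfffffff0 -> high half 0xffff (negative)  -> result lane forced to 0x0000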
    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
    {
        _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
        int64_t res64;
        int i;
        _mm_store_si128((__m128i*)atmp, a);
        _mm_store_si128((__m128i*)btmp, b);
        for (i = 0; i<4; i++) {
            res64 = (int64_t)atmp[i] * btmp[i];
            res[i] = (int)(res64 & 0xffffffff);
        }
        return _mm_load_si128((__m128i*)res);
    }

    _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
    {
        __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg;
        sign = _mm_xor_si128 (a, b);
        sign = _mm_srai_epi32 (sign, 31); //promote the sign bit to all fields: all ones if negative, all zeros if positive
        zero = _mm_setzero_si128();
        a_neg = _mm_abs_epi32 (a); //negate a and b
        b_neg = _mm_abs_epi32 (b); //negate a and b
        mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses the 0th and 2nd data lanes (abs values), the multiplication gives a 64 bit result
        mul_us_neg = _mm_sub_epi64(zero, mul_us);
        mul_us_neg = _mm_and_si128(sign, mul_us_neg);
        mul_us = _mm_andnot_si128(sign, mul_us);
        return _mm_or_si128 (mul_us, mul_us_neg);
    }

    _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
    {
        __m128i res;
        res = _mm_cmpeq_epi32 (a, b);
        return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from the hi to the low part of the 64 bit data
    }
#endif //USE_SSE4
//the special case of functions working only for 32 bits, no SSE4
_NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
{
    _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
    _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
    __m128i vec_masked, p_masked;
    pvec[LANE] = p;
    mask[LANE] = 0x0;
    vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
    p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
    return _mm_or_si128(vec_masked, p_masked);
}

_NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
{
    _NEON2SSE_ALIGN_16 int64_t tmp[2];
    _mm_store_si128((__m128i*)tmp, val);
    return tmp[LANE];
}

#ifndef _NEON2SSE_64BIT_SSE4
    #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
    #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
#endif

int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints
_NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a)
{
    //Overflow happens only if a and the sum have the opposite signs
    __m128i c7fffffff, res, res_sat, res_xor_a;
    c7fffffff = _mm_set1_epi32(0x7fffffff);
    res = _mm_slli_epi32 (a, 1); // res = a*2
    res_sat = _mm_srli_epi32(a, 31);
    res_sat = _mm_add_epi32(res_sat, c7fffffff);
    res_xor_a = _mm_xor_si128(res, a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
    res_sat = _mm_and_si128(res_xor_a, res_sat);
    res = _mm_andnot_si128(res_xor_a, res);
    return _mm_or_si128(res, res_sat);
}
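//Usage note for vqd_s32 (illustration only): it computes the saturating 2*a used by vqdmull below, e.g.
//    a = 0x40000000 ( 2^30) -> 2*a overflows           -> the lane saturates to 0x7fffffff
//    a = 0xc0000000 (-2^30) -> 2*a = -2^31 still fits  -> the lane becomes 0x80000000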
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
//*************************************************************************
//*************************************************************************
//*****************  Functions redefinition/implementation starts here ***
//*************************************************************************
//*************************************************************************
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

/*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrap here like in the following sample:
#ifdef ARM
#define vector_addq_s32 _mm_add_epi32
#else //if we have IA
#define vector_addq_s32 vadd_s32
#endif

********************************************************************************************
Functions below are organised in the following way:

Each NEON intrinsic function has one of the following options:
1.  its full x86 equivalent SSE intrinsic - in this case the x86 version just follows the NEON one under the corresponding #define statement
2.  an x86 implementation using more than one x86 intrinsic. In this case it is shaped as an inlined C function with a return statement
3.  a reference to a NEON function returning the same result and implemented in x86 as described above. In this case it is shaped as a matching NEON function definition
4.  for about 5% of functions, due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance,
    a serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path,
    please consider removing them from your code.
*/
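/*For orientation (an editorial illustration, not additional functionality), the four options above look like this in the code below:
  option 1:  #define vaddq_s8 _mm_add_epi8                                                          //direct 1:1 mapping (see Vector add)
  option 2:  _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b) {...}                  //several SSE intrinsics per NEON call
  option 3:  #define vadd_u8 vadd_s8                                                                //reuse of an already implemented NEON function
  option 4:  _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(...), _NEON2SSE_REASON_SLOW_SERIAL)  //serial fallback with a warning
*/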
//***********************************************************************
//************************      Vector add      ************************
//***********************************************************************
int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
}

int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
}

int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
}

int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b)
{
    int64x1_t res64;
    res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
    return res64;
}

float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
{
    __m128 res;
    __m64_128 res64;
    res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
    _M64f(res64, res);
    return res64;
}

uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
#define vadd_u8 vadd_s8

uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
#define vadd_u16 vadd_s16

uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
#define vadd_u32 vadd_s32

uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b)
{
    uint64x1_t res64;
    res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
    return res64;
}
int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
#define vaddq_s8 _mm_add_epi8

int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
#define vaddq_s16 _mm_add_epi16

int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
#define vaddq_s32 _mm_add_epi32

int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
#define vaddq_s64 _mm_add_epi64

float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
#define vaddq_f32 _mm_add_ps

uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
#define vaddq_u8 _mm_add_epi8

uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
#define vaddq_u16 _mm_add_epi16

uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
#define vaddq_u32 _mm_add_epi32

uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
#define vaddq_u64 _mm_add_epi64
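//A minimal usage sketch (illustration only; the _NEON2SSE_EXAMPLES guard below is hypothetical and is never defined by this header):
#ifdef _NEON2SSE_EXAMPLES
_NEON2SSE_INLINE int32x4_t _neon2sse_example_sum3_s32(int32x4_t x, int32x4_t y, int32x4_t z)
{
    //each vaddq_s32 call compiles to a single _mm_add_epi32 via the #define above
    return vaddq_s32(vaddq_s32(x, y), z);
}
#endif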
//**************************** Vector long add *****************************
//***************************************************************************
//Va, Vb have equal lane sizes, the result is a 128 bit vector of lanes that are twice the width.
int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
{
    __m128i a16, b16;
    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1
    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1
    return _mm_add_epi16 (a16, b16);
}

int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
{
    __m128i a32, b32;
    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
    return _mm_add_epi32 (a32, b32);
}

int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
{
    //may be not optimal
    __m128i a64, b64;
    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
    return _mm_add_epi64 ( a64, b64);
}

uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
{
    __m128i a16, b16;
    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
    return _mm_add_epi16 (a16, b16);
}

uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.U16 q0,d0,d0
{
    __m128i a32, b32;
    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
    return _mm_add_epi32 (a32, b32);
}

uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
{
    //may be not optimal
    __m128i a64, b64;
    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
    return _mm_add_epi64 (a64, b64);
}
//*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
//*************************************************************************************
int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
_NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
{
    __m128i b16;
    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1
    return _mm_add_epi16 (a, b16);
}

int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
_NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
{
    __m128i b32;
    b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
    return _mm_add_epi32 (a, b32);
}

int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
_NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
{
    __m128i b64;
    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
    return _mm_add_epi64 (a, b64);
}

uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
_NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
{
    __m128i b16;
    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
    return _mm_add_epi16 (a, b16);
}

uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
_NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.U16 q0,q0,d0
{
    __m128i b32;
    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
    return _mm_add_epi32 (a, b32);
}

uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
_NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
{
    __m128i b64;
    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
    return _mm_add_epi64 (a, b64);
}
//****************************** Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1, result truncated *******************************
//*************************************************************************************************************************
int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
}

int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
}

int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
}

uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
}

int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
{
    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1) trick.
    __m128i tmp1, tmp2;
    tmp1 = _mm_and_si128(a,b);
    tmp2 = _mm_xor_si128(a,b);
    tmp2 = vshrq_n_s8(tmp2,1);
    return _mm_add_epi8(tmp1,tmp2);
}

int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
{
    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1) trick.
    __m128i tmp1, tmp2;
    tmp1 = _mm_and_si128(a,b);
    tmp2 = _mm_xor_si128(a,b);
    tmp2 = _mm_srai_epi16(tmp2,1);
    return _mm_add_epi16(tmp1,tmp2);
}

int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
{
    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1) trick.
    __m128i tmp1, tmp2;
    tmp1 = _mm_and_si128(a,b);
    tmp2 = _mm_xor_si128(a,b);
    tmp2 = _mm_srai_epi32(tmp2,1);
    return _mm_add_epi32(tmp1,tmp2);
}

uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
{
    __m128i c1, sum, res;
    c1 = _mm_set1_epi8(1);
    sum = _mm_avg_epu8(a, b); //the result is rounded, need to compensate it
    res = _mm_xor_si128(a, b); //for rounding compensation
    res = _mm_and_si128(res,c1); //for rounding compensation
    return _mm_sub_epi8 (sum, res); //actual rounding compensation
}

uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.U16 q0,q0,q0
{
    __m128i sum, res;
    sum = _mm_avg_epu16(a, b); //the result is rounded, need to compensate it
    res = _mm_xor_si128(a, b); //for rounding compensation
    res = _mm_slli_epi16 (res,15); //shift left then back right to
    res = _mm_srli_epi16 (res,15); //get 1 or zero
    return _mm_sub_epi16 (sum, res); //actual rounding compensation
}

uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
{
    //need to avoid internal overflow, will use the (x&y)+((x^y)>>1) trick.
    __m128i tmp1, tmp2;
    tmp1 = _mm_and_si128(a,b);
    tmp2 = _mm_xor_si128(a,b);
    tmp2 = _mm_srli_epi32(tmp2,1);
    return _mm_add_epi32(tmp1,tmp2);
}
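//Why (x&y)+((x^y)>>1) equals (x+y)>>1 without intermediate overflow (an informal note): x+y = 2*(x&y) + (x^y),
//since x&y collects the bit positions that generate a carry and x^y the positions that do not; halving each
//term separately keeps every intermediate value inside the lane width. Example for 8-bit lanes:
//    x = 200, y = 100:  x&y = 64, x^y = 172  ->  64 + (172>>1) = 64 + 86 = 150 = (200+100)>>1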
//************************ Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 ***************************
//*****************************************************************************************************************************
int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
}

int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
}

int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
}

uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
}

uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
}

uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
}

int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
{
    //no signed average in x86 SIMD, go to unsigned
    __m128i c128, au, bu, sum;
    c128 = _mm_set1_epi8(0x80); //-128
    au = _mm_sub_epi8(a, c128); //add 128
    bu = _mm_sub_epi8(b, c128); //add 128
    sum = _mm_avg_epu8(au, bu);
    return _mm_add_epi8 (sum, c128); //sub 128
}

int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
{
    //no signed average in x86 SIMD, go to unsigned
    __m128i cx8000, au, bu, sum;
    cx8000 = _mm_set1_epi16(0x8000); // -32768
    au = _mm_sub_epi16(a, cx8000); //add 32768
    bu = _mm_sub_epi16(b, cx8000); //add 32768
    sum = _mm_avg_epu16(au, bu);
    return _mm_add_epi16 (sum, cx8000); //sub 32768
}

int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b)
{
    //need to avoid overflow
    __m128i a2, b2, res, sum;
    a2 = _mm_srai_epi32(a,1); //a2=a/2;
    b2 = _mm_srai_epi32(b,1); //b2=b/2;
    res = _mm_or_si128(a,b); //for rounding
    res = _mm_slli_epi32 (res,31); //shift left then back right to
    res = _mm_srli_epi32 (res,31); //get 1 or zero
    sum = _mm_add_epi32(a2,b2);
    return _mm_add_epi32(sum,res);
}

uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
#define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded

uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
#define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded

uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
{
    //need to avoid overflow
    __m128i a2, b2, res, sum;
    a2 = _mm_srli_epi32(a,1); //a2=a/2;
    b2 = _mm_srli_epi32(b,1); //b2=b/2;
    res = _mm_or_si128(a,b); //for rounding
    res = _mm_slli_epi32 (res,31); //shift left then back right to
    res = _mm_srli_epi32 (res,31); //get 1 or zero
    sum = _mm_add_epi32(a2,b2);
    return _mm_add_epi32(sum,res);
}
//****************** VQADD: Vector saturating add ************************
//************************************************************************
int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
}

int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
}

int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
}

int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int64x1_t res;
    uint64_t a64, b64;
    a64 = a.m64_u64[0];
    b64 = b.m64_u64[0];
    res.m64_u64[0] = a64 + b64;
    a64 = (a64 >> 63) + (~_SIGNBIT64);
    if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
        res.m64_u64[0] = a64;
    }
    return res;
}

uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
}

uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
}

uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
}

uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 uint64_t a64, b64;
    uint64x1_t res;
    a64 = a.m64_u64[0];
    b64 = b.m64_u64[0];
    res.m64_u64[0] = a64 + b64;
    if (res.m64_u64[0] < a64) {
        res.m64_u64[0] = ~(uint64_t)0;
    }
    return res;
}

int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
#define vqaddq_s8 _mm_adds_epi8

int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
#define vqaddq_s16 _mm_adds_epi16

int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
{
    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
    c7fffffff = _mm_set1_epi32(0x7fffffff);
    res = _mm_add_epi32(a, b);
    res_sat = _mm_srli_epi32(a, 31);
    res_sat = _mm_add_epi32(res_sat, c7fffffff);
    res_xor_a = _mm_xor_si128(res, a);
    b_xor_a_ = _mm_xor_si128(b, a);
    res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise
    res_sat = _mm_and_si128(res_xor_a, res_sat);
    res = _mm_andnot_si128(res_xor_a, res);
    return _mm_or_si128(res, res_sat);
}
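//Note on the signed saturation test above (an informal sketch): overflow is only possible when a and b have the
//same sign and the sign of the sum differs from it, i.e. when (res^a) is negative while (b^a) is not; res_sat is
//prepared as 0x7fffffff for a>=0 and 0x80000000 for a<0, so the overflowed lanes are simply replaced by the
//proper saturation value. Example: a = b = 0x7fffffff -> res = 0xfffffffe (negative), (res^a)<0, (b^a)>=0,
//so the lane saturates to 0x7fffffff.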
int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
    _mm_store_si128((__m128i*)atmp, a);
    _mm_store_si128((__m128i*)btmp, b);
    res[0] = atmp[0] + btmp[0];
    res[1] = atmp[1] + btmp[1];

    atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
    atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);

    if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
        res[0] = atmp[0];
    }
    if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
        res[1] = atmp[1];
    }
    return _mm_load_si128((__m128i*)res);
}

uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
#define vqaddq_u8 _mm_adds_epu8

uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
#define vqaddq_u16 _mm_adds_epu16

uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
{
    __m128i c80000000, cmp, subsum, suba, sum;
    c80000000 = _mm_set1_epi32 (0x80000000);
    sum = _mm_add_epi32 (a, b);
    subsum = _mm_sub_epi32 (sum, c80000000);
    suba = _mm_sub_epi32 (a, c80000000);
    cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
    return _mm_or_si128 (sum, cmp); //saturation
}

uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
#ifdef USE_SSE4
    _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
    {
        __m128i c80000000, sum, cmp, suba, subsum;
        c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
        sum = _mm_add_epi64 (a, b);
        subsum = _mm_sub_epi64 (sum, c80000000);
        suba = _mm_sub_epi64 (a, c80000000);
        cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
        return _mm_or_si128 (sum, cmp); //saturation
    }
#else
    _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
    {
        _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
        _mm_store_si128((__m128i*)atmp, a);
        _mm_store_si128((__m128i*)btmp, b);
        res[0] = atmp[0] + btmp[0];
        res[1] = atmp[1] + btmp[1];
        if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
        if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
        return _mm_load_si128((__m128i*)(res));
    }
#endif
//******************* Vector add high half (truncated) ******************
//************************************************************************
int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
{
    int8x8_t res64;
    __m128i sum;
    sum = _mm_add_epi16 (a, b);
    sum = _mm_srai_epi16 (sum, 8);
    sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
    return64(sum);
}

int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
{
    int16x4_t res64;
    __m128i sum;
    sum = _mm_add_epi32 (a, b);
    sum = _mm_srai_epi32(sum, 16);
    sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
    return64(sum);
}

int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b)
{
    int32x2_t res64;
    __m128i sum;
    sum = _mm_add_epi64 (a, b);
    sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //gather the high 32 bits of both 64-bit sums into the low 64 bits
    return64(sum);
}

uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
{
    uint8x8_t res64;
    __m128i sum;
    sum = _mm_add_epi16 (a, b);
    sum = _mm_srli_epi16 (sum, 8);
    sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
    return64(sum);
}

uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
{
    uint16x4_t res64;
    __m128i sum;
    sum = _mm_add_epi32 (a, b);
    sum = _mm_srli_epi32 (sum, 16);
    sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
    return64(sum);
}

uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
#define vaddhn_u64 vaddhn_s64
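//Semantics reminder for vaddhn (illustration only): each result lane is the high half of the corresponding sum,
//e.g. for vaddhn_s16: a[i] = 0x1234, b[i] = 0x0101 -> a[i]+b[i] = 0x1335 -> result byte = 0x13 (the top 8 bits)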
//*********** Vector rounding add high half: vraddhn_<type> ******************
//***************************************************************************
int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
{
    int8x8_t res64;
    __m128i sum, mask1;
    sum = _mm_add_epi16 (a, b);
    mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
    mask1 = _mm_srli_epi16(mask1, 15); //get the 7-th bit, 1 or zero
    sum = _mm_srai_epi16 (sum, 8); //get the high half
    sum = _mm_add_epi16 (sum, mask1); //actual rounding
    sum = _mm_packs_epi16 (sum, sum);
    return64(sum);
}

int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
{
    //SIMD may be not optimal, serial may be faster
    int16x4_t res64;
    __m128i sum, mask1;
    sum = _mm_add_epi32 (a, b);
    mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
    mask1 = _mm_srli_epi32(mask1,31); //get the 15-th bit, 1 or zero
    sum = _mm_srai_epi32 (sum, 16); //get the high half
    sum = _mm_add_epi32 (sum, mask1); //actual rounding
    sum = _mm_packs_epi32 (sum, sum);
    return64(sum);
}

int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
{
    //SIMD may be not optimal, serial may be faster
    int32x2_t res64;
    __m128i sum, mask1;
    sum = _mm_add_epi64 (a, b);
    mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to
    mask1 = _mm_srli_epi64(mask1,32); //get the 31-th bit, 1 or zero
    sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
    sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6));
    return64(sum);
}

uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
{
    uint8x8_t res64;
    __m128i sum, mask1;
    sum = _mm_add_epi16 (a, b);
    mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to
    mask1 = _mm_srli_epi16(mask1, 15); //get the 7-th bit, 1 or zero
    sum = _mm_srai_epi16 (sum, 8); //get the high half
    sum = _mm_add_epi16 (sum, mask1); //actual rounding
    sum = _mm_packus_epi16 (sum, sum);
    return64(sum);
}

uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
{
    //SIMD may be not optimal, serial may be faster
    uint16x4_t res64;
    __m128i sum, mask1;
    sum = _mm_add_epi32 (a, b);
    mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to
    mask1 = _mm_srli_epi32(mask1,31); //get the 15-th bit, 1 or zero
    sum = _mm_srai_epi32 (sum, 16); //get the high half
    sum = _mm_add_epi32 (sum, mask1); //actual rounding
    sum = _MM_PACKUS1_EPI32 (sum);
    return64(sum);
}

uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
#define vraddhn_u64 vraddhn_s64
//**********************************************************************************
//*********                      Multiplication            ************************
//**********************************************************************************

//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
//As we don't go to a wider result, these functions are equal to "multiply low" in x86
int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
{
    // no 8 bit simd multiply, need to go to 16 bits in SSE
    int8x8_t res64;
    __m128i a128, b128, res;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
    res = _mm_mullo_epi16 (a128, b128);
    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
    return64(res);
}

int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
#define vmul_s16 vmul_u16

int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
#define vmul_s32 vmul_u32

float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
{
    float32x2_t res64;
    __m128 tmp;
    tmp = _mm_mul_ps(_pM128(a),_pM128(b));
    _M64f(res64, tmp); //use low 64 bits
    return res64;
}

uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
{
    // no 8 bit simd multiply, need to go to 16 bits in SSE
    uint8x8_t res64;
    __m128i mask, a128, b128, res;
    mask = _mm_set1_epi16(0xff);
    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
    res = _mm_mullo_epi16 (a128, b128);
    res = _mm_and_si128(res, mask); //to avoid saturation
    res = _mm_packus_epi16 (res,res); //use only low 64 bits
    return64(res);
}

uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
}

uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint32x2_t res;
    res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
    res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
    return res;
}
poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
_NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
{
    //may be optimized
    poly8x8_t res64;
    __m128i a64, b64, c1, res, tmp, bmasked;
    int i;
    a64 = _pM128i(a);
    b64 = _pM128i(b);
    c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
    c1 = vshrq_n_u8(c1,7); //0x1
    bmasked = _mm_and_si128(b64, c1); //0x1
    res = vmulq_u8(a64, bmasked);
    for(i = 1; i<8; i++) {
        c1 = _mm_slli_epi16(c1,1); //shift the mask left by 1, 16 bit shift is OK here
        bmasked = _mm_and_si128(b64, c1); //0x1
        tmp = vmulq_u8(a64, bmasked);
        res = _mm_xor_si128(res, tmp);
    }
    return64(res);
}

int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
{
    // no 8 bit simd multiply, need to go to 16 bits
    //solution may be not optimal
    __m128i a16, b16, r16_1, r16_2;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
    r16_1 = _mm_mullo_epi16 (a16, b16);
    //swap hi and low part of a and b to process the remaining data
    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1

    r16_2 = _mm_mullo_epi16 (a16, b16);
    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
    return _mm_unpacklo_epi64(r16_1, r16_2);
}

int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
#define vmulq_s16 _mm_mullo_epi16

int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
#define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1

float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
#define vmulq_f32 _mm_mul_ps

uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
{
    // no 8 bit simd multiply, need to go to 16 bits
    //solution may be not optimal
    __m128i maskff, a16, b16, r16_1, r16_2;
    maskff = _mm_set1_epi16(0xff);
    a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
    r16_1 = _mm_mullo_epi16 (a16, b16);
    r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
    //swap hi and low part of a and b to process the remaining data
    a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
    a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1

    r16_2 = _mm_mullo_epi16 (a16, b16);
    r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
    return _mm_packus_epi16 (r16_1, r16_2);
}

uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
#define vmulq_u16 _mm_mullo_epi16

uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
#define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1

poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
_NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
{
    //may be optimized
    __m128i c1, res, tmp, bmasked;
    int i;
    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
    c1 = vshrq_n_u8(c1,7); //0x1
    bmasked = _mm_and_si128(b, c1); //0x1
    res = vmulq_u8(a, bmasked);
    for(i = 1; i<8; i++) {
        c1 = _mm_slli_epi16(c1,1); //shift the mask left by 1, 16 bit shift is OK here
        bmasked = _mm_and_si128(b, c1); //0x1
        tmp = vmulq_u8(a, bmasked);
        res = _mm_xor_si128(res, tmp);
    }
    return res;
}
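//Note on the polynomial multiply emulation above (an informal sketch): poly8 multiplication is carry-less, i.e.
//partial products are combined with XOR instead of ADD, so the loop processes one bit of b at a time: a is
//multiplied by the masked bit (an ordinary u8 multiply by 0 or by a power of two, i.e. a shift) and the partial
//products are XORed together. Example: a = 0x07 (x^2+x+1), b = 0x03 (x+1) -> a*b = 0x09 (x^3+1).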
//************************* Vector long multiply ***********************************
//****************************************************************************
int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
{
    //no 8 bit simd multiply, need to go to 16 bits
    __m128i a16, b16;
    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
}

int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
{
    #ifdef USE_SSE4
        __m128i a16, b16;
        a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
        b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
        return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
    #else
        __m128i low, hi, a128,b128;
        a128 = _pM128i(a);
        b128 = _pM128i(b);
        low = _mm_mullo_epi16(a128,b128);
        hi = _mm_mulhi_epi16(a128,b128);
        return _mm_unpacklo_epi16(low,hi);
    #endif
}

int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
{
    __m128i ab, ba, a128, b128;
    a128 = _pM128i(a);
    b128 = _pM128i(b);
    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    return _MM_MUL_EPI32(ab, ba); //uses the 0th and 2nd data lanes, the multiplication gives a 64 bit result
}

uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
{
    //no 8 bit simd multiply, need to go to 16 bits
    __m128i a16, b16;
    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
    return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
}

uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.U16 q0,d0,d0
{
    #ifdef USE_SSE4
        __m128i a16, b16;
        a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
        b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
        return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
    #else
        __m128i a128,b128,low, hi;
        a128 = _pM128i(a);
        b128 = _pM128i(b);
        low = _mm_mullo_epi16(a128,b128);
        hi = _mm_mulhi_epu16(a128,b128);
        return _mm_unpacklo_epi16(low,hi);
    #endif
}

uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
{
    ///may be not optimal compared with the serial implementation
    __m128i ab, ba, a128, b128;
    a128 = _pM128i(a);
    b128 = _pM128i(b);
    ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    return _mm_mul_epu32 (ab, ba); //uses the 0th and 2nd data lanes, the multiplication gives a 64 bit result
}
poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
{
    //may be optimized
    __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
    int i;
    a128 = _pM128i(a);
    b128 = _pM128i(b);
    c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
    c1 = vshrq_n_u8(c1,7); //0x1
    bmasked = _mm_and_si128(b128, c1); //0x1

    a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
    bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
    res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
    for(i = 1; i<8; i++) {
        c1 = _mm_slli_epi16(c1,1); //shift the mask left by 1, 16 bit shift is OK here
        bmasked = _mm_and_si128(b128, c1); //0x1
        bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
        tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
        res = _mm_xor_si128(res, tmp);
    }
    return res;
}
//**************** Vector saturating doubling long multiply **************************
//*****************************************************************
int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
{
    //the serial solution may be faster due to saturation
    __m128i res;
    res = vmull_s16(a, b);
    return vqd_s32(res);
}

int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
{
    //the serial solution may be faster due to saturation
    __m128i res;
    res = vmull_s32(a,b);
    return vqaddq_s64(res,res); //slow serial function!!!!
}
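//Note on vqdmull (illustration only): the only 16-bit input pair whose doubled product overflows is
//a[i] = b[i] = 0x8000 (-32768): the product is 0x40000000 and 2*product would be 0x80000000, so vqd_s32
//saturates that lane to 0x7fffffff, matching the NEON definition.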
//********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************
//******************************************************************************************
int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
{
    // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
    int8x8_t res64;
    __m128i b128, c128, res;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
    c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
    res = _mm_mullo_epi16 (c128, b128);
    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
    res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
    return64(res);
}

int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c)
{
    int16x4_t res64;
    return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
}

int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
{
    int32x2_t res64;
    __m128i res;
    res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
    res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
    return64(res);
}

float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    //fma is coming soon, but right now:
    __m128 res;
    __m64_128 res64;
    res = _mm_mul_ps (_pM128(c), _pM128(b));
    res = _mm_add_ps (_pM128(a), res);
    _M64f(res64, res);
    return res64;
}

uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
{
    // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits
    uint8x8_t res64;
    __m128i mask, b128, c128, res;
    mask = _mm_set1_epi16(0xff);
    b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
    c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
    res = _mm_mullo_epi16 (c128, b128);
    res = _mm_and_si128(res, mask); //to avoid saturation
    res = _mm_packus_epi16 (res, res);
    res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
    return64(res);
}

uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
#define vmla_u16 vmla_s16

uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
#define vmla_u32 vmla_s32
int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
{
    //solution may not be optimal
    // no 8 bit simd multiply, need to go to 16 bits
    __m128i b16, c16, r16_1, a_2, r16_2;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
    r16_1 = _mm_mullo_epi16 (b16, c16);
    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
    r16_1 = _mm_add_epi8 (r16_1, a);
    //swap hi and low part of a, b and c to process the remaining data
    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1

    r16_2 = _mm_mullo_epi16 (b16, c16);
    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
    r16_2 = _mm_add_epi8(r16_2, a_2);
    return _mm_unpacklo_epi64(r16_1,r16_2);
}
int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
{
    __m128i res;
    res = _mm_mullo_epi16 (c, b);
    return _mm_add_epi16 (res, a);
}

int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
{
    __m128i res;
    res = _MM_MULLO_EPI32 (c, b); //SSE4.1
    return _mm_add_epi32 (res, a);
}

float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
_NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
{
    //fma is coming soon, but right now:
    __m128 res;
    res = _mm_mul_ps (c, b);
    return _mm_add_ps (a, res);
}
uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
{
    //solution may not be optimal
    // no 8 bit simd multiply, need to go to 16 bits
    __m128i b16, c16, r16_1, a_2, r16_2;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
    r16_1 = _mm_mullo_epi16 (b16, c16);
    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
    r16_1 = _mm_add_epi8 (r16_1, a);
    //swap hi and low part of a, b and c to process the remaining data
    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1

    r16_2 = _mm_mullo_epi16 (b16, c16);
    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
    r16_2 = _mm_add_epi8(r16_2, a_2);
    return _mm_unpacklo_epi64(r16_1,r16_2);
}

uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
#define vmlaq_u16 vmlaq_s16

uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
#define vmlaq_u32 vmlaq_s32
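/* Usage sketch, illustrative only (acc0, coeff and sample are hypothetical variables): vmla/vmlaq
   implement r[i] = a[i] + b[i]*c[i] with the same wrap-around behaviour as the NEON instruction:
       int16x8_t acc = vmlaq_s16(acc0, coeff, sample); // acc[i] = acc0[i] + coeff[i]*sample[i]
   The 8-bit variants above go through 16-bit multiplies because x86 has no 8-bit SIMD multiply. */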
//********************** Vector widening multiply accumulate (long multiply accumulate):
// vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] **************
//********************************************************************************************
int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
{
    __m128i res;
    res = vmull_s8(b, c);
    return _mm_add_epi16 (res, a);
}

int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
{
    //may not be optimal compared with the serial implementation
    __m128i res;
    res = vmull_s16(b, c);
    return _mm_add_epi32 (res, a);
}

int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
{
    //may not be optimal compared with the serial implementation
    __m128i res;
    res = vmull_s32( b, c);
    return _mm_add_epi64 (res, a);
}

uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
{
    __m128i res;
    res = vmull_u8(b, c);
    return _mm_add_epi16 (res, a);
}

uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
{
    //may not be optimal compared with the serial implementation
    __m128i res;
    res = vmull_u16(b, c);
    return _mm_add_epi32 (res, a);
}

uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
{
    //may not be optimal compared with the serial implementation
    __m128i res;
    res = vmull_u32( b,c);
    return _mm_add_epi64 (res, a);
}
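/* Illustrative note (not from the original header): vmlal_xx widens b and c before multiplying, so
   the accumulator lanes are twice as wide as the inputs and the products themselves cannot overflow:
       uint16x8_t acc = vmlal_u8(acc, pix_a, pix_b); // acc[i] += (uint16_t)pix_a[i] * pix_b[i]
   acc, pix_a and pix_b are hypothetical names used only for the example. */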
//******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
//********************************************************************************************
int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
{
    // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
    int8x8_t res64;
    __m128i res;
    res64 = vmul_s8(b,c);
    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
    return64(res);
}

int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c)
{
    int16x4_t res64;
    return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
}

int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
{
    int32x2_t res64;
    __m128i res;
    res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
    res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
    return64(res);
}

float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
{
    __m128 res;
    __m64_128 res64;
    res = _mm_mul_ps (_pM128(c), _pM128(b));
    res = _mm_sub_ps (_pM128(a), res);
    _M64f(res64, res);
    return res64;
}

uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
    // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits
    uint8x8_t res64;
    __m128i res;
    res64 = vmul_u8(b,c);
    res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
    return64(res);
}

uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
#define vmls_u16 vmls_s16

uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
#define vmls_u32 vmls_s32
int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
{
    //solution may not be optimal
    // no 8 bit simd multiply, need to go to 16 bits
    __m128i b16, c16, r16_1, a_2, r16_2;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
    c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
    r16_1 = _mm_mullo_epi16 (b16, c16);
    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
    r16_1 = _mm_sub_epi8 (a, r16_1);
    //swap hi and low part of a, b, c to process the remaining data
    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
    b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
    c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1

    r16_2 = _mm_mullo_epi16 (b16, c16);
    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
    r16_2 = _mm_sub_epi8 (a_2, r16_2);
    return _mm_unpacklo_epi64(r16_1,r16_2);
}

int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
{
    __m128i res;
    res = _mm_mullo_epi16 (c, b);
    return _mm_sub_epi16 (a, res);
}

int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
{
    __m128i res;
    res = _MM_MULLO_EPI32 (c, b); //SSE4.1
    return _mm_sub_epi32 (a, res);
}

float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
_NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
{
    __m128 res;
    res = _mm_mul_ps (c, b);
    return _mm_sub_ps (a, res);
}

uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
{
    //solution may not be optimal
    // no 8 bit simd multiply, need to go to 16 bits
    __m128i b16, c16, r16_1, a_2, r16_2;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
    c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
    r16_1 = _mm_mullo_epi16 (b16, c16);
    r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
    r16_1 = _mm_sub_epi8 (a, r16_1);
    //swap hi and low part of a, b and c to process the remaining data
    a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
    c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
    b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
    c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1

    r16_2 = _mm_mullo_epi16 (b16, c16);
    r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
    r16_2 = _mm_sub_epi8(a_2, r16_2);
    return _mm_unpacklo_epi64(r16_1,r16_2);
}

uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
#define vmlsq_u16 vmlsq_s16

uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
#define vmlsq_u32 vmlsq_s32
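/* Illustrative note (not from the original header): vmls/vmlsq is the subtract counterpart,
   r[i] = a[i] - b[i]*c[i], hence the _mm_sub_* calls above where vmla used _mm_add_*, e.g.
       int32x4_t r = vmlsq_s32(acc, b, c); // hypothetical acc/b/c, r[i] = acc[i] - b[i]*c[i] */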
//******************** Vector multiply subtract long (widening multiply subtract) ************************************
//*************************************************************************************************************
int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
{
    __m128i res;
    res = vmull_s8(b, c);
    return _mm_sub_epi16 (a, res);
}

int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
{
    //may not be optimal compared with the serial implementation
    __m128i res;
    res = vmull_s16(b, c);
    return _mm_sub_epi32 (a, res);
}

int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
{
    //may not be optimal compared with the serial implementation
    __m128i res;
    res = vmull_s32( b,c);
    return _mm_sub_epi64 (a, res);
}

uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
{
    __m128i res;
    res = vmull_u8(b, c);
    return _mm_sub_epi16 (a, res);
}

uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
{
    //may not be optimal compared with the serial implementation
    __m128i res;
    res = vmull_u16(b, c);
    return _mm_sub_epi32 (a, res);
}

uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
{
    //may not be optimal compared with the serial implementation
    __m128i res;
    res = vmull_u32( b,c);
    return _mm_sub_epi64 (a, res);
}
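/* Illustrative note (not from the original header): vmlsl_xx subtracts the widened product from a
   wide accumulator, e.g. with a hypothetical int32x4_t acc:
       acc = vmlsl_s16(acc, b, c); // acc[i] -= (int32_t)b[i] * c[i] */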
//****** Vector saturating doubling multiply high **********************
//*************************************************************************
int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int16x4_t res;
    int32_t a32, b32, i;
    for (i = 0; i<4; i++) {
        a32 = (int32_t) a.m64_i16[i];
        b32 = (int32_t) b.m64_i16[i];
        a32 = (a32 * b32) >> 15;
        res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
    }
    return res;
}

int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
{
    //may not be optimal compared with a serial solution
    int32x2_t res64;
    __m128i mask;
    _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    int64x2_t mul;
    mul = vmull_s32(a,b);
    mul = _mm_slli_epi64(mul,1); //double the result
    //at this point start treating 2 64-bit numbers as 4 32-bit
    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
    mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
    return64(mul);
}

int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
{
    __m128i res, res_lo, mask;
    _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
    res = _mm_mulhi_epi16 (a, b);
    res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
    res_lo = _mm_mullo_epi16 (a, b);
    res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
    res = _mm_add_epi16(res, res_lo); //combine results
    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
    return _mm_xor_si128 (res, mask); //res saturated for 0x8000
}

int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
    __m128i ab, ba, mask, mul, mul1;
    _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    mul = _MM_MUL_EPI32(ab, ba); //uses the first and third data lanes, the multiplication gives a 64 bit result
    mul = _mm_slli_epi64(mul,1); //double the result
    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    mul1 = _MM_MUL_EPI32(ab, ba); //uses the first and third data lanes, the multiplication gives a 64 bit result
    mul1 = _mm_slli_epi64(mul1,1); //double the result
    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
    mul = _mm_unpacklo_epi64(mul, mul1);
    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
    return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
}
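/* Illustrative note (not from the original header): vqdmulh returns the high half of the doubled
   product, i.e. a Q15/Q31 fixed-point multiply. A scalar sketch for one 16-bit lane:
       int32_t p = ((int32_t)a * b) >> 15;              // == (2*a*b) >> 16
       int16_t r = (p == 0x8000) ? 0x7fff : (int16_t)p; // saturate the single overflow case
   which matches the serial loop in vqdmulh_s16 above. */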
//********* Vector saturating rounding doubling multiply high ****************
//****************************************************************************
//If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and order
int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
}

int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    //may not be optimal compared with a serial solution
    int32x2_t res64;
    _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    __m128i res_sat, mask, mask1;
    int64x2_t mul;
    mul = vmull_s32(a,b);
    res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
    mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
    mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
    mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
    //at this point start treating 2 64-bit numbers as 4 32-bit
    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
    mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
    return64(mul);
}

int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
{
    __m128i res, mask;
    _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
    res = _mm_mulhrs_epi16 (a, b);
    mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
    return _mm_xor_si128 (res, mask); //res saturated for 0x8000
}

int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
    __m128i ab, ba, mask, mul, mul1, mask1;
    _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
    ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    mul = _MM_MUL_EPI32(ab, ba); //uses the first and third data lanes, the multiplication gives a 64 bit result
    mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
    mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
    mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
    mul = _mm_add_epi32 (mul, mask1); //actual rounding

    ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
    ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    mul1 = _MM_MUL_EPI32(ab, ba); //uses the first and third data lanes, the multiplication gives a 64 bit result
    mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
    mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
    mask1 = _mm_srli_epi64(mask1,31); //get the 31st bit, 1 or zero
    mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
    //at this point start treating 2 64-bit numbers as 4 32-bit
    mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
    mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
    mul = _mm_unpacklo_epi64(mul, mul1);
    mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
    return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000
}
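/* Illustrative note (not from the original header): the rounding variant roughly computes
   r = sat((2*a*b + 0x8000) >> 16) for 16-bit lanes; _mm_mulhrs_epi16 implements almost the same
   rounding, which is why the comment above warns that results may occasionally differ from NEON. */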
//*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
//*************************************************************************************************************************
int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
{
    //not an optimal SIMD solution, serial may be faster
    __m128i res32;
    res32 = vmull_s16(b, c);
    res32 = vqd_s32(res32); //doubling & saturation, if no saturation we could use _mm_slli_epi32 (res, 1);
    return vqaddq_s32(res32, a); //saturation
}

int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
{
    __m128i res64;
    res64 = vmull_s32(b,c);
    res64 = vqaddq_s64(res64, res64); //doubling & saturation, if no saturation we could use _mm_slli_epi64 (res, 1);
    return vqaddq_s64(res64, a); //saturation
}
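/* Illustrative note (not from the original header): vqdmlal chains two saturating steps,
   acc[i] = sat(acc[i] + sat(2 * (widened)b[i]*c[i])), which is why the code above saturates the
   doubling first and then uses the saturating vqaddq_* for the accumulation. */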
//************************************************************************************
//****************** Vector subtract ***********************************************
//************************************************************************************
int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
}

int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
}

int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
}

int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b)
{
    int64x1_t res64;
    res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
    return res64;
}

float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
{
    float32x2_t res;
    res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
    res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
    return res;
}

uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
#define vsub_u8 vsub_s8

uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
#define vsub_u16 vsub_s16

uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
#define vsub_u32 vsub_s32

uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b)
{
    uint64x1_t res64;
    res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
    return res64;
}

int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
#define vsubq_s8 _mm_sub_epi8

int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
#define vsubq_s16 _mm_sub_epi16

int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
#define vsubq_s32 _mm_sub_epi32

int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
#define vsubq_s64 _mm_sub_epi64

float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
#define vsubq_f32 _mm_sub_ps

uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
#define vsubq_u8 _mm_sub_epi8

uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
#define vsubq_u16 _mm_sub_epi16

uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
#define vsubq_u32 _mm_sub_epi32

uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
#define vsubq_u64 _mm_sub_epi64
//***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
//***********************************************************************************
//Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
{
    __m128i a16, b16;
    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1
    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi16 (a16, b16);
}

int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
{
    __m128i a32, b32;
    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi32 (a32, b32);
}

int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
_NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
{
    //may not be optimal
    __m128i a64, b64;
    a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi64 (a64, b64);
}

uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
{
    __m128i a16, b16;
    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi16 (a16, b16);
}

uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
{
    __m128i a32, b32;
    a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi32 (a32, b32);
}

uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
_NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
{
    //may not be optimal
    __m128i a64, b64;
    a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi64 (a64, b64);
}
//***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
//*****************************************************************************************************
int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
_NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
{
    __m128i b16;
    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi16 (a, b16);
}

int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
_NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
{
    __m128i b32;
    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi32 (a, b32);
}

int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
_NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
{
    __m128i b64;
    b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi64 (a, b64);
}

uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
_NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
{
    __m128i b16;
    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi16 (a, b16);
}

uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
_NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
{
    __m128i b32;
    b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi32 (a, b32);
}

uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
_NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
{
    __m128i b64;
    b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
    return _mm_sub_epi64 (a, b64);
}
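/* Illustrative note (not from the original header): vsubl widens both operands before subtracting,
   vsubw widens only the second one, e.g. with hypothetical vectors a8/b8/a16:
       int16x8_t d1 = vsubl_s8(a8, b8);  // d1[i] = (int16_t)a8[i] - (int16_t)b8[i], cannot overflow
       int16x8_t d2 = vsubw_s8(a16, b8); // d2[i] = a16[i] - (int16_t)b8[i]
*/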
//************************Vector saturating subtract *********************************
//*************************************************************************************
int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
}

int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
}

int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
}

int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
{
    uint64x1_t res;
    uint64_t a64, b64;
    a64 = a.m64_u64[0];
    b64 = b.m64_u64[0];
    res.m64_u64[0] = a64 - b64;

    a64 = (a64 >> 63) + (~_SIGNBIT64);
    if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
        res.m64_u64[0] = a64;
    }
    return res;
}

uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
}

uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
}

uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
}

uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint64x1_t res;
    uint64_t a64, b64;
    a64 = a.m64_u64[0];
    b64 = b.m64_u64[0];
    if (a64 > b64) {
        res.m64_u64[0] = a64 - b64;
    } else {
        res.m64_u64[0] = 0;
    }
    return res;
}

int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
#define vqsubq_s8 _mm_subs_epi8

int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
#define vqsubq_s16 _mm_subs_epi16

int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
{
    //no corresponding x86 SIMD solution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has the opposite sign to a
    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
    c7fffffff = _mm_set1_epi32(0x7fffffff);
    res = _mm_sub_epi32(a, b);
    res_sat = _mm_srli_epi32(a, 31);
    res_sat = _mm_add_epi32(res_sat, c7fffffff);
    res_xor_a = _mm_xor_si128(res, a);
    b_xor_a = _mm_xor_si128(b, a);
    res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if negative, all zeros otherwise
    res_sat = _mm_and_si128(res_xor_a, res_sat);
    res = _mm_andnot_si128(res_xor_a, res);
    return _mm_or_si128(res, res_sat);
}

int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
{
    _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
    _NEON2SSE_ALIGN_16 uint64_t res[2];
    _mm_store_si128((__m128i*)atmp, a);
    _mm_store_si128((__m128i*)btmp, b);
    res[0] = atmp[0] - btmp[0];
    res[1] = atmp[1] - btmp[1];
    if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
        res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
    }
    if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
        res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
    }
    return _mm_load_si128((__m128i*)res);
}

uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
#define vqsubq_u8 _mm_subs_epu8

uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
#define vqsubq_u16 _mm_subs_epu16

uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
{
    __m128i min, mask, sub;
    min = _MM_MIN_EPU32(a, b); //SSE4.1
    mask = _mm_cmpeq_epi32 (min, b);
    sub = _mm_sub_epi32 (a, b);
    return _mm_and_si128 ( sub, mask);
}

_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
{
    __m128i c80000000, subb, suba, cmp, sub;
    c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
    sub = _mm_sub_epi64 (a, b);
    suba = _mm_sub_epi64 (a, c80000000);
    subb = _mm_sub_epi64 (b, c80000000);
    cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
    return _mm_and_si128 (sub, cmp); //saturation
}
#else
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
    _mm_store_si128((__m128i*)atmp, a);
    _mm_store_si128((__m128i*)btmp, b);
    res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0;
    res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0;
    return _mm_load_si128((__m128i*)(res));
}
#endif
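/* Illustrative note (not from the original header): saturating subtract clamps instead of wrapping.
   For unsigned lanes vqsub(a,b)[i] == (a[i] > b[i]) ? a[i] - b[i] : 0; for signed lanes the result
   is clamped to the lane's minimum/maximum, which is exactly what the 64-bit serial fallbacks above
   emulate lane by lane. */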
//**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ******************************************************
//****************************************************************
int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
{
    //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit
    int8x8_t res64;
    __m128i r16;
    int8x8_t r;
    r = vsub_s8 (a, b);
    r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
    r16 = _mm_srai_epi16 (r16, 1); //SSE2
    r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits
    return64(r16);
}

int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
}

int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
}

uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
}

int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
{
    //need to deal with the possibility of internal overflow
    __m128i c128, au, bu;
    c128 = _mm_set1_epi8 (128);
    au = _mm_add_epi8( a, c128);
    bu = _mm_add_epi8( b, c128);
    return vhsubq_u8(au,bu);
}

int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
{
    //need to deal with the possibility of internal overflow
    __m128i c8000, au, bu;
    c8000 = _mm_set1_epi16(0x8000);
    au = _mm_add_epi16( a, c8000);
    bu = _mm_add_epi16( b, c8000);
    return vhsubq_u16(au,bu);
}

int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
{
    //need to deal with the possibility of internal overflow
    __m128i a2, b2, r, b_1;
    a2 = _mm_srai_epi32 (a,1);
    b2 = _mm_srai_epi32 (b,1);
    r = _mm_sub_epi32 (a2, b2);
    b_1 = _mm_andnot_si128(a, b); //!a and b
    b_1 = _mm_slli_epi32 (b_1,31);
    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
    return _mm_sub_epi32(r,b_1);
}

uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
{
    __m128i avg;
    avg = _mm_avg_epu8 (a, b);
    return _mm_sub_epi8(a, avg);
}

uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
{
    __m128i avg;
    avg = _mm_avg_epu16 (a, b);
    return _mm_sub_epi16(a, avg);
}

uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
{
    //need to deal with the possibility of internal overflow
    __m128i a2, b2, r, b_1;
    a2 = _mm_srli_epi32 (a,1);
    b2 = _mm_srli_epi32 (b,1);
    r = _mm_sub_epi32 (a2, b2);
    b_1 = _mm_andnot_si128(a, b); //!a and b
    b_1 = _mm_slli_epi32 (b_1,31);
    b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
    return _mm_sub_epi32(r,b_1);
}
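/* Illustrative note (not from the original header): vhsub computes (a[i]-b[i])>>1 on the full-width
   difference. For the unsigned q-forms above the identity a - avg(a,b) == (a-b)>>1, where
   _mm_avg_epu8/_mm_avg_epu16 round the average up, is what makes the two-instruction version work. */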
//******* Vector subtract high half (truncated) ** ************
//************************************************************
int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
{
    int8x8_t res64;
    __m128i sum, sum8;
    sum = _mm_sub_epi16 (a, b);
    sum8 = _mm_srai_epi16 (sum, 8);
    sum8 = _mm_packs_epi16(sum8,sum8);
    return64(sum8);
}

int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
{
    int16x4_t res64;
    __m128i sum, sum16;
    sum = _mm_sub_epi32 (a, b);
    sum16 = _mm_srai_epi32 (sum, 16);
    sum16 = _mm_packs_epi32(sum16,sum16);
    return64(sum16);
}

int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
{
    int32x2_t res64;
    __m128i sub;
    sub = _mm_sub_epi64 (a, b);
    sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
    return64(sub);
}

uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
{
    uint8x8_t res64;
    __m128i sum, sum8;
    sum = _mm_sub_epi16 (a, b);
    sum8 = _mm_srli_epi16 (sum, 8);
    sum8 = _mm_packus_epi16(sum8,sum8);
    return64(sum8);
}

uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
{
    uint16x4_t res64;
    __m128i sum, sum16;
    sum = _mm_sub_epi32 (a, b);
    sum16 = _mm_srli_epi32 (sum, 16);
    sum16 = _MM_PACKUS1_EPI32(sum16);
    return64(sum16);
}

uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
#define vsubhn_u64 vsubhn_s64
//************ Vector rounding subtract high half *********************
//*********************************************************************
int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
{
    int8x8_t res64;
    __m128i sub, mask1;
    sub = _mm_sub_epi16 (a, b);
    mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
    mask1 = _mm_srli_epi16(mask1, 15); //get the 7th bit, 1 or zero
    sub = _mm_srai_epi16 (sub, 8); //get high half
    sub = _mm_add_epi16 (sub, mask1); //actual rounding
    sub = _mm_packs_epi16 (sub, sub);
    return64(sub);
}

int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
{
    //SIMD may not be optimal, serial may be faster
    int16x4_t res64;
    __m128i sub, mask1;
    sub = _mm_sub_epi32 (a, b);
    mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
    mask1 = _mm_srli_epi32(mask1,31); //get the 15th bit, 1 or zero
    sub = _mm_srai_epi32 (sub, 16); //get high half
    sub = _mm_add_epi32 (sub, mask1); //actual rounding
    sub = _mm_packs_epi32 (sub, sub);
    return64(sub);
}

int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
_NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
{
    //SIMD may not be optimal, serial may be faster
    int32x2_t res64;
    __m128i sub, mask1;
    sub = _mm_sub_epi64 (a, b);
    mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to
    mask1 = _mm_srli_epi64(mask1,32); //get the 31st bit, 1 or zero
    sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
    sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6));
    return64(sub);
}

uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
{
    uint8x8_t res64;
    __m128i sub, mask1;
    sub = _mm_sub_epi16 (a, b);
    mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to
    mask1 = _mm_srli_epi16(mask1, 15); //get the 7th bit, 1 or zero
    sub = _mm_srai_epi16 (sub, 8); //get high half
    sub = _mm_add_epi16 (sub, mask1); //actual rounding
    sub = _mm_packus_epi16 (sub, sub);
    return64(sub);
}

uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
{
    //SIMD may not be optimal, serial may be faster
    uint16x4_t res64;
    __m128i sub, mask1;
    sub = _mm_sub_epi32 (a, b);
    mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to
    mask1 = _mm_srli_epi32(mask1,31); //get the 15th bit, 1 or zero
    sub = _mm_srai_epi32 (sub, 16); //get high half
    sub = _mm_add_epi32 (sub, mask1); //actual rounding
    sub = _MM_PACKUS1_EPI32 (sub);
    return64(sub);
}

uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
#define vrsubhn_u64 vrsubhn_s64
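/* Illustrative note (not from the original header): vsubhn keeps the high half of each wide
   difference (truncating), e.g. per 16->8 lane r = (int8_t)((a - b) >> 8), while vrsubhn adds a
   rounding bit to the difference before the high half is taken, as the shift/mask code above does. */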
//*********** Vector saturating doubling multiply subtract long ********************
//************************************************************************************
int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
{
    //not an optimal SIMD solution, serial may be faster
    __m128i res32, mask;
    int32x4_t res;
    _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    res = vmull_s16(b, c);
    res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
    mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
    res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000
    return vqsubq_s32(a, res32); //saturation
}

int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    __m128i res64, mask;
    int64x2_t res;
    _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
    res = vmull_s32(b, c);
    res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
    mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
    res64 = _mm_xor_si128 (res64, mask); //res64 saturated for 0x8000000000000000
    return vqsubq_s64(a, res64); //saturation
}
//****************** COMPARISON ***************************************
//******************* Vector compare equal *************************************
//****************************************************************************
uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b)
{
    uint8x8_t res64;
    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
}

uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b)
{
    uint16x4_t res64;
    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
}

uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b)
{
    uint32x2_t res64;
    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
}

uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
{
    uint32x2_t res64;
    __m128 res;
    res = _mm_cmpeq_ps(_pM128(a), _pM128(b));
    return64f(res);
}

uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
}

uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
}

uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
}

uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
#define vceq_p8 vceq_u8

uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_s8 _mm_cmpeq_epi8

uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
#define vceqq_s16 _mm_cmpeq_epi16

uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
#define vceqq_s32 _mm_cmpeq_epi32

uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
{
    __m128 res;
    res = _mm_cmpeq_ps(a,b);
    return *(__m128i*)&res;
}

uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_u8 _mm_cmpeq_epi8

uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
#define vceqq_u16 _mm_cmpeq_epi16

uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
#define vceqq_u32 _mm_cmpeq_epi32

uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
#define vceqq_p8 _mm_cmpeq_epi8
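/* Illustrative note (not from the original header): all vceq/vcge/vcgt style comparisons return a
   mask vector, each lane is all ones when the condition holds and all zeros otherwise, so the result
   can be fed directly into bit-select style code, e.g. with hypothetical a and b:
       uint32x4_t m = vceqq_u32(a, b);
       uint32x4_t r = _mm_and_si128(m, a); // keep a[i] only where a[i] == b[i]
*/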
//******************Vector compare greater-than or equal*************************
//*******************************************************************************
//in IA SIMD there is no greater-than-or-equal comparison for integers,
// only greater-than is available, so we need the following tricks

uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b)
{
    uint8x8_t res64;
    return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b)
{
    uint16x4_t res64;
    return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b)
{
    uint32x2_t res64;
    return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
}

uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
{
    uint32x2_t res64;
    __m128 res;
    res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only the first 2 entries
    return64f(res);
}

uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b)
{
    //serial solution looks faster
    uint32x2_t res64;
    return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
}

uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
{
    __m128i m1, m2;
    m1 = _mm_cmpgt_epi8 ( a, b);
    m2 = _mm_cmpeq_epi8 ( a, b);
    return _mm_or_si128 ( m1, m2);
}

uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
{
    __m128i m1, m2;
    m1 = _mm_cmpgt_epi16 ( a, b);
    m2 = _mm_cmpeq_epi16 ( a, b);
    return _mm_or_si128 ( m1, m2);
}

uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
{
    __m128i m1, m2;
    m1 = _mm_cmpgt_epi32 (a, b);
    m2 = _mm_cmpeq_epi32 (a, b);
    return _mm_or_si128 (m1, m2);
}

uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
{
    __m128 res;
    res = _mm_cmpge_ps(a,b);
    return *(__m128i*)&res;
}

uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
{
    //no unsigned chars comparison, only signed available, so need the trick
#ifdef USE_SSE4
    __m128i cmp;
    cmp = _mm_max_epu8(a, b);
    return _mm_cmpeq_epi8(cmp, a); //a>=b
#else
    __m128i c128, as, bs, m1, m2;
    c128 = _mm_set1_epi8 (128);
    as = _mm_sub_epi8( a, c128);
    bs = _mm_sub_epi8( b, c128);
    m1 = _mm_cmpgt_epi8( as, bs);
    m2 = _mm_cmpeq_epi8 (as, bs);
    return _mm_or_si128 ( m1, m2);
#endif
}

uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
{
    //no unsigned shorts comparison, only signed available, so need the trick
#ifdef USE_SSE4
    __m128i cmp;
    cmp = _mm_max_epu16(a, b);
    return _mm_cmpeq_epi16(cmp, a); //a>=b
#else
    __m128i c8000, as, bs, m1, m2;
    c8000 = _mm_set1_epi16 (0x8000);
    as = _mm_sub_epi16(a,c8000);
    bs = _mm_sub_epi16(b,c8000);
    m1 = _mm_cmpgt_epi16(as, bs);
    m2 = _mm_cmpeq_epi16 (as, bs);
    return _mm_or_si128 ( m1, m2);
#endif
}

uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
{
    //no unsigned ints comparison, only signed available, so need the trick
#ifdef USE_SSE4
    __m128i cmp;
    cmp = _mm_max_epu32(a, b);
    return _mm_cmpeq_epi32(cmp, a); //a>=b
#else
    //serial solution may be faster
    __m128i c80000000, as, bs, m1, m2;
    c80000000 = _mm_set1_epi32 (0x80000000);
    as = _mm_sub_epi32(a,c80000000);
    bs = _mm_sub_epi32(b,c80000000);
    m1 = _mm_cmpgt_epi32 (as, bs);
    m2 = _mm_cmpeq_epi32 (as, bs);
    return _mm_or_si128 ( m1, m2);
#endif
}
//**********************Vector compare less-than or equal******************************
//***************************************************************************************
//in IA SIMD there is no less-than-or-equal comparison for integers, so we need the tricks

uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b)
{
    uint8x8_t res64;
    return64(vcleq_s8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b)
{
    uint16x4_t res64;
    return64(vcleq_s16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b)
{
    uint32x2_t res64;
    return64(vcleq_s32(_pM128i(a), _pM128i(b)));
}

uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0?
_NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
{
    uint32x2_t res64;
    __m128 res;
    res = _mm_cmple_ps(_pM128(a),_pM128(b));
    return64f(res);
}

uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
#define vcle_u8(a,b) vcge_u8(b,a)

uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0
#define vcle_u16(a,b) vcge_u16(b,a)

uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
#define vcle_u32(a,b) vcge_u32(b,a)

uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
{
    __m128i c1, res;
    c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
    res = _mm_cmpgt_epi8 ( a, b);
    return _mm_andnot_si128 (res, c1); //invert the cmpgt result to get less-than-or-equal
}

uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
{
    __m128i c1, res;
    c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
    res = _mm_cmpgt_epi16 ( a, b);
    return _mm_andnot_si128 (res, c1);
}

uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
{
    __m128i c1, res;
    c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
    res = _mm_cmpgt_epi32 ( a, b);
    return _mm_andnot_si128 (res, c1);
}

uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
{
    __m128 res;
    res = _mm_cmple_ps(a,b);
    return *(__m128i*)&res;
}

uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
{
    //no unsigned chars comparison in SSE, only signed available, so need the trick
    __m128i cmp;
    cmp = _mm_min_epu8(a, b);
    return _mm_cmpeq_epi8(cmp, a); //a<=b
}
#else
#define vcleq_u8(a,b) vcgeq_u8(b,a)
#endif

uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
{
    //no unsigned shorts comparison in SSE, only signed available, so need the trick
    __m128i cmp;
    cmp = _mm_min_epu16(a, b);
    return _mm_cmpeq_epi16(cmp, a); //a<=b
}
#else
#define vcleq_u16(a,b) vcgeq_u16(b,a)
#endif

uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
{
    //no unsigned ints comparison in SSE, only signed available, so need the trick
    __m128i cmp;
    cmp = _mm_min_epu32(a, b);
    return _mm_cmpeq_epi32(cmp, a); //a<=b
}
#else
//solution may not be optimal compared with the serial one
#define vcleq_u32(a,b) vcgeq_u32(b,a)
#endif
//****** Vector compare greater-than ******************************************
//**************************************************************************
uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
{
    uint8x8_t res64;
    return64(_mm_cmpgt_epi8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
{
    uint16x4_t res64;
    return64(_mm_cmpgt_epi16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
{
    uint32x2_t res64;
    return64(_mm_cmpgt_epi32(_pM128i(a), _pM128i(b)));
}

uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
{
    uint32x2_t res64;
    __m128 res;
    res = _mm_cmpgt_ps(_pM128(a), _pM128(b)); //use only 2 first entries
    return64f(res);
}

uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
}

uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
#define vcgtq_s8 _mm_cmpgt_epi8

uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
#define vcgtq_s16 _mm_cmpgt_epi16

uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
#define vcgtq_s32 _mm_cmpgt_epi32

uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
{
    __m128 res;
    res = _mm_cmpgt_ps(a, b);
    return *(__m128i*)&res;
}

uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
{
    //no unsigned chars comparison, only signed available, so need the trick
    __m128i c128, as, bs;
    c128 = _mm_set1_epi8 (128);
    as = _mm_sub_epi8(a, c128);
    bs = _mm_sub_epi8(b, c128);
    return _mm_cmpgt_epi8 (as, bs);
}

uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
{
    //no unsigned short comparison, only signed available, so need the trick
    __m128i c8000, as, bs;
    c8000 = _mm_set1_epi16 (0x8000);
    as = _mm_sub_epi16(a, c8000);
    bs = _mm_sub_epi16(b, c8000);
    return _mm_cmpgt_epi16 (as, bs);
}

uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
{
    //no unsigned int comparison, only signed available, so need the trick
    __m128i c80000000, as, bs;
    c80000000 = _mm_set1_epi32 (0x80000000);
    as = _mm_sub_epi32(a, c80000000);
    bs = _mm_sub_epi32(b, c80000000);
    return _mm_cmpgt_epi32 (as, bs);
}
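//Illustrative sketch (not part of the original API, names are hypothetical): the bias trick above
//subtracts 0x80..00 from both operands, mapping unsigned values onto the signed range so that the
//signed _mm_cmpgt_* result matches the unsigned comparison. A scalar model of one lane:
#if 0
#include <stdint.h>
static uint32_t scalar_cgt_u32(uint32_t a, uint32_t b)
{
    int32_t as = (int32_t)(a - 0x80000000u); //bias both operands identically
    int32_t bs = (int32_t)(b - 0x80000000u);
    return (as > bs) ? 0xffffffffu : 0u;     //all-ones lane mask on true
}
#endif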
//********************* Vector compare less-than **************************
//*************************************************************************
uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
#define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!

uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
#define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!

uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
#define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!!

uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
#define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!

uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
#define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!

uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
#define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!

uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
#define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!

uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
#define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!

uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
#define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!

uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
#define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!

uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
#define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!

uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
#define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!

uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
#define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!

uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
#define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
//*****************Vector compare absolute greater-than or equal ************
//***************************************************************************
uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
{
    uint32x2_t res64;
    __m128i c7fffffff;
    __m128 a0, b0;
    c7fffffff = _mm_set1_epi32 (0x7fffffff);
    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
    a0 = _mm_cmpge_ps (a0, b0);
    return64f(a0);
}

uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
{
    __m128i c7fffffff;
    __m128 a0, b0;
    c7fffffff = _mm_set1_epi32 (0x7fffffff);
    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
    a0 = _mm_cmpge_ps (a0, b0);
    return (*(__m128i*)&a0);
}

//********Vector compare absolute less-than or equal ******************
//********************************************************************
uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
{
    uint32x2_t res64;
    __m128i c7fffffff;
    __m128 a0, b0;
    c7fffffff = _mm_set1_epi32 (0x7fffffff);
    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
    a0 = _mm_cmple_ps (a0, b0);
    return64f(a0);
}

uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
{
    __m128i c7fffffff;
    __m128 a0, b0;
    c7fffffff = _mm_set1_epi32 (0x7fffffff);
    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
    a0 = _mm_cmple_ps (a0, b0);
    return (*(__m128i*)&a0);
}

//******** Vector compare absolute greater-than ******************
//******************************************************************
uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
{
    uint32x2_t res64;
    __m128i c7fffffff;
    __m128 a0, b0;
    c7fffffff = _mm_set1_epi32 (0x7fffffff);
    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
    a0 = _mm_cmpgt_ps (a0, b0);
    return64f(a0);
}

uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
{
    __m128i c7fffffff;
    __m128 a0, b0;
    c7fffffff = _mm_set1_epi32 (0x7fffffff);
    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
    a0 = _mm_cmpgt_ps (a0, b0);
    return (*(__m128i*)&a0);
}

//***************Vector compare absolute less-than ***********************
//*************************************************************************
uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
{
    uint32x2_t res64;
    __m128i c7fffffff;
    __m128 a0, b0;
    c7fffffff = _mm_set1_epi32 (0x7fffffff);
    a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
    b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
    a0 = _mm_cmplt_ps (a0, b0);
    return64f(a0);
}

uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
{
    __m128i c7fffffff;
    __m128 a0, b0;
    c7fffffff = _mm_set1_epi32 (0x7fffffff);
    a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
    b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
    a0 = _mm_cmplt_ps (a0, b0);
    return (*(__m128i*)&a0);
}
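//Illustrative sketch (not part of the original API, names are hypothetical): the absolute
//comparisons above clear the IEEE sign bit with the 0x7fffffff mask, which is the same as
//comparing fabsf(a) with fabsf(b). Scalar model of one lane:
#if 0
#include <math.h>
#include <stdint.h>
static uint32_t scalar_cage_f32(float a, float b)
{
    return (fabsf(a) >= fabsf(b)) ? 0xffffffffu : 0u; //all-ones lane mask on true
}
#endif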
//*************************Vector test bits************************************
//*****************************************************************************
/*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
with the corresponding element of a second vector. If the result is not zero, the
corresponding element in the destination vector is set to all ones. Otherwise, it is set to
all zeros. */

uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b)
{
    uint8x8_t res64;
    return64(vtstq_s8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b)
{
    uint16x4_t res64;
    return64(vtstq_s16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b)
{
    uint32x2_t res64;
    return64(vtstq_s32(_pM128i(a), _pM128i(b)));
}

uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
#define vtst_u8 vtst_s8

uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
#define vtst_u16 vtst_s16

uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
#define vtst_u32 vtst_s32

uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
#define vtst_p8 vtst_u8

uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
{
    __m128i zero, one, res;
    zero = _mm_setzero_si128 ();
    one = _mm_cmpeq_epi8(zero, zero); //0xfff..ffff
    res = _mm_and_si128 (a, b);
    res = _mm_cmpeq_epi8 (res, zero);
    return _mm_xor_si128(res, one); //invert result
}

uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
{
    __m128i zero, one, res;
    zero = _mm_setzero_si128 ();
    one = _mm_cmpeq_epi8(zero, zero); //0xfff..ffff
    res = _mm_and_si128 (a, b);
    res = _mm_cmpeq_epi16 (res, zero);
    return _mm_xor_si128(res, one); //invert result
}

uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
{
    __m128i zero, one, res;
    zero = _mm_setzero_si128 ();
    one = _mm_cmpeq_epi8(zero, zero); //0xfff..ffff
    res = _mm_and_si128 (a, b);
    res = _mm_cmpeq_epi32 (res, zero);
    return _mm_xor_si128(res, one); //invert result
}

uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
#define vtstq_u8 vtstq_s8

uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
#define vtstq_u16 vtstq_s16

uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
#define vtstq_u32 vtstq_s32

uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
#define vtstq_p8 vtstq_u8
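//Illustrative sketch (not part of the original API, name is hypothetical): scalar model of the
//VTST semantics implemented above - a lane becomes all ones exactly when (a & b) has at least
//one bit set.
#if 0
#include <stdint.h>
static uint8_t scalar_vtst_u8(uint8_t a, uint8_t b)
{
    return ((a & b) != 0) ? 0xff : 0x00;
}
#endif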
//****************** Absolute difference ********************
//*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
//************************************************************
int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(vabdq_s8(_pM128i(a), _pM128i(b)));
}

int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(vabdq_s16(_pM128i(a), _pM128i(b)));
}

int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(vabdq_s32(_pM128i(a), _pM128i(b)));
}

uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(vabdq_u8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(vabdq_u16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    return64(vabdq_u32(_pM128i(a), _pM128i(b)));
}

float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
{
    float32x2_t res64;
    __m128 res;
    res = vabdq_f32(_pM128(a), _pM128(b));
    return64f(res);
}

int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
{
    __m128i res;
    res = _mm_sub_epi8 (a, b);
    return _mm_abs_epi8 (res);
}

int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
{
    __m128i res;
    res = _mm_sub_epi16 (a, b);
    return _mm_abs_epi16 (res);
}

int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
{
    __m128i res;
    res = _mm_sub_epi32 (a, b);
    return _mm_abs_epi32 (res);
}

uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
{
    __m128i cmp, difab, difba;
    cmp = vcgtq_u8(a, b);
    difab = _mm_sub_epi8(a, b);
    difba = _mm_sub_epi8 (b, a);
    difab = _mm_and_si128(cmp, difab);
    difba = _mm_andnot_si128(cmp, difba);
    return _mm_or_si128(difab, difba);
}

uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
{
    __m128i cmp, difab, difba;
    cmp = vcgtq_u16(a, b);
    difab = _mm_sub_epi16(a, b);
    difba = _mm_sub_epi16 (b, a);
    difab = _mm_and_si128(cmp, difab);
    difba = _mm_andnot_si128(cmp, difba);
    return _mm_or_si128(difab, difba);
}

uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
{
    __m128i cmp, difab, difba;
    cmp = vcgtq_u32(a, b);
    difab = _mm_sub_epi32(a, b);
    difba = _mm_sub_epi32 (b, a);
    difab = _mm_and_si128(cmp, difab);
    difba = _mm_andnot_si128(cmp, difba);
    return _mm_or_si128(difab, difba);
}

float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
_NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
{
    __m128i c1;
    __m128 res;
    c1 = _mm_set1_epi32(0x7fffffff);
    res = _mm_sub_ps (a, b);
    return _mm_and_ps (res, *(__m128*)&c1);
}
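//Illustrative sketch (not part of the original API, name is hypothetical): scalar model of the
//unsigned absolute difference select used above - both a-b and b-a are formed and the comparison
//mask picks the one that did not wrap, so no widening or signed abs is needed.
#if 0
#include <stdint.h>
static uint8_t scalar_vabd_u8(uint8_t a, uint8_t b)
{
    return (a > b) ? (uint8_t)(a - b) : (uint8_t)(b - a);
}
#endif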
//************ Absolute difference - long **************************
//********************************************************************
int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
{
    __m128i a16, b16;
    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
    return vabdq_s16(a16, b16);
}

int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
{
    __m128i a32, b32;
    a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
    b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
    return vabdq_s32(a32, b32);
}

int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //no optimal SIMD solution, serial looks faster
    _NEON2SSE_ALIGN_16 int64_t res[2];
    if(a.m64_i32[0] > b.m64_i32[0]) res[0] = (int64_t) a.m64_i32[0] - (int64_t) b.m64_i32[0];
    else res[0] = (int64_t) b.m64_i32[0] - (int64_t) a.m64_i32[0];
    if(a.m64_i32[1] > b.m64_i32[1]) res[1] = (int64_t) a.m64_i32[1] - (int64_t) b.m64_i32[1];
    else res[1] = (int64_t) b.m64_i32[1] - (int64_t) a.m64_i32[1];
    return _mm_load_si128((__m128i*)res);
}

uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
{
    __m128i res;
    res = vsubl_u8(a, b);
    return _mm_abs_epi16(res);
}

uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
{
    __m128i res;
    res = vsubl_u16(a, b);
    return _mm_abs_epi32(res);
}

uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 uint64_t res[2];
    if(a.m64_u32[0] > b.m64_u32[0]) res[0] = (uint64_t) a.m64_u32[0] - (uint64_t) b.m64_u32[0];
    else res[0] = (uint64_t) b.m64_u32[0] - (uint64_t) a.m64_u32[0];
    if(a.m64_u32[1] > b.m64_u32[1]) res[1] = (uint64_t) a.m64_u32[1] - (uint64_t) b.m64_u32[1];
    else res[1] = (uint64_t) b.m64_u32[1] - (uint64_t) a.m64_u32[1];
    return _mm_load_si128((__m128i*)res);
}
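//Illustrative sketch (not part of the original API, name is hypothetical): vabdl_* widens to the
//double-width type before subtracting, so the difference is exact and a plain abs finishes the
//job. Scalar model of one lane:
#if 0
#include <stdint.h>
static int16_t scalar_vabdl_s8(int8_t a, int8_t b)
{
    int16_t d = (int16_t)a - (int16_t)b; //cannot overflow in 16 bits
    return (d < 0) ? (int16_t)-d : d;
}
#endif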
//**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
//*********************************************************************************************
int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c)
{
    int8x8_t res64;
    return64(vabaq_s8(_pM128i(a), _pM128i(b), _pM128i(c)));
}

int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c)
{
    int16x4_t res64;
    return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
}

int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c)
{
    int32x2_t res64;
    return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
}

uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
#define vaba_u8 vaba_s8

uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
#define vaba_u16 vaba_s16

uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c)
{
    uint32x2_t res64;
    return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
}

int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
_NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
{
    int8x16_t sub;
    sub = vabdq_s8(b, c);
    return vaddq_s8( a, sub);
}

int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
_NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
{
    int16x8_t sub;
    sub = vabdq_s16(b, c);
    return vaddq_s16( a, sub);
}

int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
{
    int32x4_t sub;
    sub = vabdq_s32(b, c);
    return vaddq_s32( a, sub);
}

uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
{
    uint8x16_t sub;
    sub = vabdq_u8(b, c);
    return vaddq_u8( a, sub);
}

uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
{
    uint16x8_t sub;
    sub = vabdq_u16(b, c);
    return vaddq_u16( a, sub);
}

uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
{
    uint32x4_t sub;
    sub = vabdq_u32(b, c);
    return vaddq_u32( a, sub);
}
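//Illustrative usage sketch (not part of the original API, function name is hypothetical):
//vabaq_u8 as a building block of a sum-of-absolute-differences style accumulation.
#if 0
static uint8x16_t accumulate_abs_diff(uint8x16_t acc, uint8x16_t row0, uint8x16_t row1)
{
    return vabaq_u8(acc, row0, row1); //acc[i] += |row0[i] - row1[i]|, wrapping modulo 2^8 as on NEON
}
#endif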
//************** Absolute difference and accumulate - long ********************************
//*************************************************************************************
int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
_NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
{
    __m128i b16, c16, res;
    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
    c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
    return _mm_add_epi16 (a, res);
}

int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
{
    __m128i b32, c32, res;
    b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
    c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
    return _mm_add_epi32 (a, res);
}

int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    __m128i res;
    res = vabdl_s32(b, c);
    return _mm_add_epi64(a, res);
}

uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
_NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
{
    __m128i b16, c16, res;
    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
    c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
    res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
    return _mm_add_epi16 (a, res);
}

uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
_NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
{
    __m128i b32, c32, res;
    b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
    c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
    res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
    return _mm_add_epi32 (a, res);
}

uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    __m128i res;
    res = vabdl_u32(b, c);
    return _mm_add_epi64(a, res);
}
//***********************************************************************************
//**************** Maximum and minimum operations **********************************
//***********************************************************************************
//************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] *******
//***********************************************************************************
int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    __m128i res;
    res = _MM_MAX_EPI8(_pM128i(a), _pM128i(b)); //SSE4.1, use only lower 64 bits
    return64(res);
}

int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(_mm_max_epi16(_pM128i(a), _pM128i(b)));
}

int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    __m128i res;
    res = _MM_MAX_EPI32(_pM128i(a), _pM128i(b)); //SSE4.1, use only lower 64 bits
    return64(res);
}

uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(_mm_max_epu8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(_MM_MAX_EPU16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    __m128i res;
    res = _MM_MAX_EPU32(_pM128i(a), _pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
    return64(res);
}

float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
{
    //serial solution looks faster than SIMD one
    float32x2_t res;
    res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
    res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
    return res;
}

int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
#define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1

int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
#define vmaxq_s16 _mm_max_epi16

int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
#define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1

uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
#define vmaxq_u8 _mm_max_epu8

uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
#define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1

uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
#define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1

float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
#define vmaxq_f32 _mm_max_ps

//*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
//***********************************************************************************************************
int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    __m128i res;
    res = _MM_MIN_EPI8(_pM128i(a), _pM128i(b)); //SSE4.1, use only lower 64 bits
    return64(res);
}

int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(_mm_min_epi16(_pM128i(a), _pM128i(b)));
}

int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    __m128i res;
    res = _MM_MIN_EPI32(_pM128i(a), _pM128i(b)); //SSE4.1, use only lower 64 bits
    return64(res);
}

uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    return64(_mm_min_epu8(_pM128i(a), _pM128i(b)));
}

uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
{
    uint16x4_t res64;
    return64(_MM_MIN_EPU16(_pM128i(a), _pM128i(b)));
}

uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b)
{
    uint32x2_t res64;
    __m128i res;
    res = _MM_MIN_EPU32(_pM128i(a), _pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
    return64(res);
}

float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
{
    //serial solution looks faster than SIMD one
    float32x2_t res;
    res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
    res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
    return res;
}

int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
#define vminq_s8 _MM_MIN_EPI8 //SSE4.1

int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
#define vminq_s16 _mm_min_epi16

int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
#define vminq_s32 _MM_MIN_EPI32 //SSE4.1

uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
#define vminq_u8 _mm_min_epu8

uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
#define vminq_u16 _MM_MIN_EPU16 //SSE4.1

uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
#define vminq_u32 _MM_MIN_EPU32 //SSE4.1

float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
#define vminq_f32 _mm_min_ps
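//Illustrative usage sketch (not part of the original API, function name is hypothetical):
//clamping a vector to [lo, hi] with the min/max mappings above.
#if 0
static float32x4_t clamp_f32(float32x4_t x, float32x4_t lo, float32x4_t hi)
{
    return vminq_f32(vmaxq_f32(x, lo), hi);
}
#endif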
//************* Pairwise addition operations. **************************************
//************************************************************************************
//Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
{
    //no 8 bit hadd in IA32, need to go to 16 bit and then pack
    int8x8_t res64;
    __m128i a16, b16, res;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
    b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
    res = _mm_hadd_epi16 (a16, b16);
    res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
    return64(res);
}

int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    __m128i hadd128;
    hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
    return64(hadd128);
}

int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    __m128i hadd128;
    hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
    hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
    return64(hadd128);
}

uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
{
    // no 8 bit hadd in IA32, need to go to 16 bit and then pack
    // no unsigned _mm_hadd_ functions in IA32, but 8 bit unsigned fits into 16 bit signed, so it should work
    uint8x8_t res64;
    __m128i mask8, a16, b16, res;
    mask8 = _mm_set1_epi16(0xff);
    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
    b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
    res = _mm_hadd_epi16 (a16, b16);
    res = _mm_and_si128(res, mask8); //to avoid saturation
    res = _mm_packus_epi16 (res, res); //use low 64 bits
    return64(res);
}

uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
{
    // solution may be not optimal, serial execution may be faster
    // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
    uint16x4_t res64;
    __m128i c32767, cfffe, as, bs, res;
    c32767 = _mm_set1_epi16 (32767);
    cfffe = _mm_set1_epi16 (0xfffe);
    as = _mm_sub_epi16 (_pM128i(a), c32767);
    bs = _mm_sub_epi16 (_pM128i(b), c32767);
    res = _mm_hadd_epi16 (as, bs);
    res = _mm_add_epi16 (res, cfffe);
    res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
    return64(res);
}

uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
{
    //hadd doesn't work for unsigned values
    uint32x2_t res64;
    __m128i ab, ab_sh, res;
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
    ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
    res = _mm_add_epi32(ab, ab_sh);
    res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
    return64(res);
}

float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
_NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
{
    float32x2_t res64;
    __m128 hadd128;
    hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
    hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
    _M64f(res64, hadd128);
    return res64;
}
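//Illustrative sketch (not part of the original API, name is hypothetical): scalar model of the
//pairwise add layout produced above - the low half of the result holds the pair sums of a,
//the high half those of b.
#if 0
#include <stdint.h>
static void scalar_vpadd_s16(const int16_t a[4], const int16_t b[4], int16_t r[4])
{
    r[0] = (int16_t)(a[0] + a[1]);
    r[1] = (int16_t)(a[2] + a[3]);
    r[2] = (int16_t)(b[0] + b[1]);
    r[3] = (int16_t)(b[2] + b[3]);
}
#endif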
//************************** Long pairwise add **********************************
//*********************************************************************************
//Adds adjacent pairs of elements of a vector, sign or zero extends the results to twice their original width,
// and places the final results in the destination vector.

int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
_NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
{
    //no 8 bit hadd in IA32, need to go to 16 bit anyway
    int16x4_t res64;
    __m128i a16;
    a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
    a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
    return64(a16);
}

int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
_NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
{
    // solution may be not optimal, serial execution may be faster
    int32x2_t res64;
    __m128i r32_1;
    r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
    r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
    return64(r32_1);
}

int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
{
    int64x1_t res;
    res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
    return res;
}

uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
_NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
{
    // no 8 bit hadd in IA32, need to go to 16 bit
    // no unsigned _mm_hadd_ functions in IA32, but 8 bit unsigned fits into 16 bit signed, so it should work
    uint16x4_t res64;
    __m128i a16;
    a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
    a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
    return64(a16);
}

uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution looks faster than a SIMD one
    uint32x2_t res;
    res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
    res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
    return res;
}

uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
{
    uint64x1_t res;
    res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
    return res;
}

int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
_NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
{
    //no 8 bit hadd in IA32, need to go to 16 bit
    __m128i r16_1, r16_2;
    r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
    //swap hi and low part of r to process the remaining data
    r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
    return _mm_hadd_epi16 (r16_1, r16_2);
}

int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
_NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
{
    //widen to 32 bit and then use the horizontal add
    __m128i r32_1, r32_2;
    r32_1 = _MM_CVTEPI16_EPI32(a);
    //swap hi and low part of r to process the remaining data
    r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
    return _mm_hadd_epi32 (r32_1, r32_2);
}

int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
{
    _NEON2SSE_ALIGN_16 int32_t atmp[4];
    _NEON2SSE_ALIGN_16 int64_t res[2];
    _mm_store_si128((__m128i*)atmp, a);
    res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
    res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
    return _mm_load_si128((__m128i*)res);
}

uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
_NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
{
    //no 8 bit hadd in IA32, need to go to 16 bit
    __m128i r16_1, r16_2;
    r16_1 = _MM_CVTEPU8_EPI16(a);
    //swap hi and low part of r to process the remaining data
    r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
    return _mm_hadd_epi16 (r16_1, r16_2);
}

uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution looks faster than a SIMD one
    _NEON2SSE_ALIGN_16 uint16_t atmp[8];
    _NEON2SSE_ALIGN_16 uint32_t res[4];
    _mm_store_si128((__m128i*)atmp, a);
    res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
    res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
    res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
    res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
    return _mm_load_si128((__m128i*)res);
}

uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 uint32_t atmp[4];
    _NEON2SSE_ALIGN_16 uint64_t res[2];
    _mm_store_si128((__m128i*)atmp, a);
    res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
    res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
    return _mm_load_si128((__m128i*)res);
}
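//Illustrative sketch (not part of the original API, name is hypothetical): scalar model of the
//long pairwise add - adjacent pairs are summed into the double-width type, so the sums cannot
//wrap around.
#if 0
#include <stdint.h>
static void scalar_vpaddl_u8(const uint8_t a[8], uint16_t r[4])
{
    int i;
    for (i = 0; i < 4; i++)
        r[i] = (uint16_t)((uint16_t)a[2*i] + (uint16_t)a[2*i + 1]);
}
#endif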
//************************ Long pairwise add and accumulate **************************
//****************************************************************************************
//VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
// and accumulates the values of the results into the elements of the destination (wide) vector
int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
_NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b)
{
    int16x4_t res64;
    return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
}

int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
_NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b)
{
    int32x2_t res64;
    return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
}

int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
_NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
{
    int64x1_t res;
    res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
    return res;
}

uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
_NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b)
{
    uint16x4_t res64;
    return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
}

uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0
_NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b)
{
    uint32x2_t res64;
    return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
}

uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
_NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
{
    uint64x1_t res;
    res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
    return res;
}

int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
_NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
{
    int16x8_t pad;
    pad = vpaddlq_s8(b);
    return _mm_add_epi16 (a, pad);
}

int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
_NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
{
    int32x4_t pad;
    pad = vpaddlq_s16(b);
    return _mm_add_epi32(a, pad);
}

int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
_NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
{
    int64x2_t pad;
    pad = vpaddlq_s32(b);
    return _mm_add_epi64 (a, pad);
}

uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
_NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
{
    uint16x8_t pad;
    pad = vpaddlq_u8(b);
    return _mm_add_epi16 (a, pad);
}

uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint32x4_t pad;
    pad = vpaddlq_u16(b);
    return _mm_add_epi32(a, pad);
} //no optimal SIMD solution, serial is faster

uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //no optimal SIMD solution, serial is faster
    uint64x2_t pad;
    pad = vpaddlq_u32(b);
    return _mm_add_epi64(a, pad);
} //no optimal SIMD solution, serial is faster
//********** Folding maximum *************************************
//*******************************************************************
//VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
//and copies the larger of each pair into the corresponding element in the destination
// no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
{
    int8x8_t res64;
    __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
    _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
    max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
    return64(max); //we need 64 bits only
}

int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
{
    //solution may be not optimal compared with the serial one
    int16x4_t res64;
    __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each char pair is considered to be a 16 bit number
    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
    max = _mm_max_epi16 (ab, ab1);
    max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
    return64(max);
}

int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution looks faster than SIMD one
    int32x2_t res;
    res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
    res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
    return res;
}

uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
{
    uint8x8_t res64;
    __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
    _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
    max = _mm_max_epu8 (ab, ab1);
    max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
    return64(max);
}

uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
{
    //solution may be not optimal compared with the serial one
    uint16x4_t res64;
    __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each char pair is considered to be a 16 bit number
    _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
    max = _MM_MAX_EPU16 (ab, ab1);
    max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
    return64(max);
}

uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution looks faster than SIMD one
    uint32x2_t res;
    res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
    res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
    return res;
} //serial solution looks faster than a SIMD one

float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution looks faster than SIMD one
    float32x2_t res;
    res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
    res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
    return res;
}
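//Illustrative sketch (not part of the original API, name is hypothetical): scalar model of the
//folding maximum - the low half of the result folds a, the high half folds b, matching the
//layout produced above.
#if 0
#include <stdint.h>
static void scalar_vpmax_s8(const int8_t a[8], const int8_t b[8], int8_t r[8])
{
    int i;
    for (i = 0; i < 4; i++) {
        r[i]     = (a[2*i] > a[2*i + 1]) ? a[2*i] : a[2*i + 1];
        r[i + 4] = (b[2*i] > b[2*i + 1]) ? b[2*i] : b[2*i + 1];
    }
}
#endif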
// ***************** Folding minimum ****************************
// **************************************************************
//vpmin -> takes minimum of adjacent pairs
int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
_NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
{
    int8x8_t res64;
    __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
    _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
    min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1
    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
    return64(min);
}

int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
_NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
{
    //solution may be not optimal compared with the serial one
    int16x4_t res64;
    __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each char pair is considered to be a 16 bit number
    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
    min = _mm_min_epi16 (ab, ab1);
    min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
    return64(min);
}

int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution looks faster than SIMD one
    int32x2_t res;
    res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
    res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
    return res;
}

uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
_NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
{
    uint8x8_t res64;
    __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
    _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
    min = _mm_min_epu8 (ab, ab1);
    min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
    return64(min);
}

uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
_NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
{
    //solution may be not optimal compared with the serial one
    uint16x4_t res64;
    __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each char pair is considered to be a 16 bit number
    _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
    min = _MM_MIN_EPU16 (ab, ab1);
    min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
    return64(min);
}

uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution looks faster than SIMD one
    uint32x2_t res;
    res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
    res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
    return res;
}

float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution looks faster than SIMD one
    float32x2_t res;
    res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
    res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
    return res;
}
//***************************************************************
//*********** Reciprocal/Sqrt ************************************
//***************************************************************
//****************** Reciprocal estimate *******************************
//the ARM NEON and x86 SIMD results may be slightly different
float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
{
    float32x2_t res64;
    __m128 res;
    res = _mm_rcp_ps(_pM128(a));
    return64f(res);
}

uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //Input is fixed point number!!! No reciprocal for ints in IA32 available
    uint32x2_t res;
    float resf, r;
    int i, q, s;
    for (i = 0; i < 2; i++){
        if((a.m64_u32[i] & 0x80000000) == 0) {
            res.m64_u32[i] = 0xffffffff;
        } else {
            resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
            q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
            r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
            s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
            r = (float)s / 256.0;
            res.m64_u32[i] = r * (uint32_t)(1 << 31);
        }
    }
    return res;
}

float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
#define vrecpeq_f32 _mm_rcp_ps

uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //Input is fixed point number!!!
    //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
    _NEON2SSE_ALIGN_16 uint32_t atmp[4];
    _NEON2SSE_ALIGN_16 uint32_t res[4];
    _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    float resf, r;
    int i, q, s;
    __m128i res128, mask, zero;
    _mm_store_si128((__m128i*)atmp, a);
    zero = _mm_setzero_si128();
    for (i = 0; i < 4; i++){
        resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
        q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
        r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
        s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
        r = (float)s / 256.0;
        res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
    }
    res128 = _mm_load_si128((__m128i*)res);
    mask = _mm_and_si128(a, *(__m128i*)c80000000);
    mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff
    return _mm_or_si128(res128, mask);
}
//**********Reciprocal square root estimate ****************
//**********************************************************
//no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster
//but the particular implementation for vrsqrte_u32 may vary for various ARM compilers
//the ARM NEON and x86 SIMD results may be slightly different
float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
{
    float32x2_t res64;
    __m128 res;
    res = _mm_rsqrt_ps(_pM128(a));
    return64f(res);
}

uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //Input is fixed point number!!!
    //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
    uint32x2_t res;
    __m128 tmp;
    float r, resf, coeff;
    int i, q0, s;
    for (i = 0; i < 2; i++){
        if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
            res.m64_u32[i] = 0xffffffff;
        } else {
            resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
            coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
            q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
            r = ((float)q0 + 0.5) / coeff;
            tmp = _mm_rsqrt_ss(_mm_load_ss( &r)); /* reciprocal root r */
            _mm_store_ss(&r, tmp);
            s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
            r = (float)s / 256.0;
            res.m64_u32[i] = r * (((uint32_t)1) << 31);
        }
    }
    return res;
}

float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
#define vrsqrteq_f32 _mm_rsqrt_ps

uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //Input is fixed point number!!!
    //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
    _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
    _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)};
    _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000, 0xc0000000, 0xc0000000, 0xc0000000};
    __m128 tmp;
    __m128i res128, mask, zero;
    float r, resf, coeff;
    int i, q0, s;
    _mm_store_si128((__m128i*)atmp, a);
    zero = _mm_setzero_si128();
    for (i = 0; i < 4; i++){
        resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
        coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/
        q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
        r = ((float)q0 + 0.5) / coeff;
        tmp = _mm_rsqrt_ss(_mm_load_ss( &r)); /* reciprocal root r */
        _mm_store_ss(&r, tmp);
        s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
        r = (float)s / 256.0;
        res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
    }
    res128 = _mm_load_si128((__m128i*)res);
    mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
    mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff
    return _mm_or_si128(res128, mask);
}
//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
//******************************************************************************************
//******VRECPS (Vector Reciprocal Step) ***************************************************
//multiplies the elements of one vector by the corresponding elements of another vector,
//subtracts each of the results from 2, and places the final results into the elements of the destination vector.

float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
_NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
{
    __m128 res;
    __m64_128 res64;
    res = vrecpsq_f32(_pM128(a), _pM128(b));
    _M64f(res64, res);
    return res64;
}

float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
{
    __m128 f2, mul;
    f2 = _mm_set1_ps(2.);
    mul = _mm_mul_ps(a,b);
    return _mm_sub_ps(f2,mul);
}
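//Illustrative usage sketch, not part of the original intrinsic set: VRECPS exists to drive a
//Newton-Raphson refinement of the VRECPE estimate; the helper name below is hypothetical.
_NEON2SSE_INLINE float32x4_t example_reciprocal_refined_f32(float32x4_t x)
{
    float32x4_t estimate = vrecpeq_f32(x); //rough initial estimate of 1/x
    estimate = vmulq_f32(vrecpsq_f32(x, estimate), estimate); //one refinement step: estimate *= (2 - x*estimate)
    return estimate;
}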
//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
//multiplies the elements of one vector by the corresponding elements of another vector,
//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.

float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
_NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
{
    float32x2_t res;
    res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
    res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
    return res;
}

float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
{
    __m128 f3, f05, mul;
    f3 = _mm_set1_ps(3.);
    f05 = _mm_set1_ps(0.5);
    mul = _mm_mul_ps(a,b);
    f3 = _mm_sub_ps(f3,mul);
    return _mm_mul_ps(f3, f05);
}
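//Illustrative usage sketch (hypothetical helper, assuming the standard NEON refinement idiom):
//VRSQRTS supplies the (3 - x*e*e)/2 factor for one Newton-Raphson step on the VRSQRTE estimate.
_NEON2SSE_INLINE float32x4_t example_rsqrt_refined_f32(float32x4_t x)
{
    float32x4_t estimate = vrsqrteq_f32(x); //rough initial estimate of 1/sqrt(x)
    estimate = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, estimate), estimate), estimate); //estimate *= (3 - x*estimate*estimate)/2
    return estimate;
}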
//********************************************************************************************
//***************************** Shifts by signed variable ***********************************
//********************************************************************************************
//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
//********************************************************************************************
//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
//helper macro. It matches ARM implementation for big shifts
#define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
        for (i = 0; i<LEN; i++) { \
        if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
        else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
        return _mm_load_si128((__m128i*)res);

#define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
        int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
        for (i = 0; i<LEN; i++) { \
        if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
        else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
        return res;
int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT_64(8, i, 8)
}

int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT_64(16, i, 4)
}

int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT_64(32, i, 2)
}

int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT_64(64, i, 1)
}

uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT_64(8, u, 8)
}

uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT_64(16, u, 4)
}

uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT_64(32, u, 2)
}

uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing for large numbers
{
    SERIAL_SHIFT_64(64, u, 1)
}

int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT(int8_t, int8_t, 16, 16)
}

int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT(int16_t, int16_t, 8, 8)
}

int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT(int32_t, int32_t, 4, 4)
}

int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT(int64_t, int64_t, 2, 2)
}

uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
}

uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
}

uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
}

uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
}
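//Illustrative semantics sketch (hypothetical helper, lane values assumed for illustration):
//per-lane variable shift where negative counts shift right and |count| >= lane size yields 0,
//e.g. values {1, -8, 4, 16} with counts {2, 1, -1, -40} give {4, -16, 2, 0}.
_NEON2SSE_INLINE int32x4_t example_shift_by_signed_counts_s32(int32x4_t values, int32x4_t counts)
{
    return vshlq_s32(values, counts);
}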
//*********** Vector saturating shift left: (negative values shift right) **********************
//********************************************************************************************
//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
#define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
        for (i = 0; i<LEN; i++) { \
        if (atmp[i] ==0) res[i] = 0; \
        else{ \
            if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
            else{ \
                if (btmp[i]>lanesize_1) { \
                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
                }else{ \
                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
                    else res[i] = atmp[i] << btmp[i]; }}}} \
        return _mm_load_si128((__m128i*)res);

#define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
        TYPE lanesize = (sizeof(TYPE) << 3); \
        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
        for (i = 0; i<LEN; i++) { \
        if (atmp[i] ==0) {res[i] = 0; \
        }else{ \
            if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
            else{ \
                if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
                else{ \
                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
                    res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
        return _mm_load_si128((__m128i*)res);

#define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
        int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
        int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
        for (i = 0; i<LEN; i++) { \
        if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
        else{ \
            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
            else{ \
                if (b.m64_i ## TYPE[i]>lanesize_1) { \
                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
                }else{ \
                    limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
        return res;

#define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
        int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
        int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
        for (i = 0; i<LEN; i++) { \
        if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
        }else{ \
            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
            else{ \
                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
                else{ \
                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
        return res;
int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
}

int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
}

int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
}

int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
}

uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
}

uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
}

uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
}

uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
}

int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
}

int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
}

int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
}

int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
}

uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
}

uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
}

uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
}

uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
}
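//Illustrative semantics sketch (hypothetical helper): the saturating variant clamps instead of
//wrapping, e.g. an int16 lane holding 20000 shifted left by 2 becomes 32767 rather than 14464.
_NEON2SSE_INLINE int16x8_t example_saturating_scale_s16(int16x8_t samples, int16x8_t gains_log2)
{
    return vqshlq_s16(samples, gains_log2);
}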
//******** Vector rounding shift left: (negative values shift right) **********
//****************************************************************************
//No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
//rounding makes sense for right shifts only.
#define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
        for (i = 0; i<LEN; i++) { \
        if( btmp[i] >= 0) { \
            if(btmp[i] >= lanesize) res[i] = 0; \
            else res[i] = (atmp[i] << btmp[i]); \
        }else{ \
            res[i] = (btmp[i] < -lanesize) ? 0 : \
                     (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
                     (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \
        return _mm_load_si128((__m128i*)res);

#define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
        int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \
        for (i = 0; i<LEN; i++) { \
        if( b.m64_i ## TYPE[i] >= 0) { \
            if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
            else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
        }else{ \
            res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
                                          (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
                                          (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \
        return res;
int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT_64(8,i,8)
}

int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT_64(16,i,4)
}

int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT_64(32,i,2)
}

int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT_64(64,i,1)
}

uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT_64(8,u,8)
}

uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT_64(16,u,4)
}

uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT_64(32,u,2)
}

uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT_64(64,u,1)
}

int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
}

int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
}

int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
}

int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
}

uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
}

uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
}

uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
}

uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
}
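//Illustrative semantics sketch (hypothetical helper): with a negative count the rounding variant
//adds back the last bit shifted out, e.g. an int32 lane holding 7 shifted by -1 gives 4, not 3.
_NEON2SSE_INLINE int32x4_t example_rounding_shift_s32(int32x4_t values, int32x4_t counts)
{
    return vrshlq_s32(values, counts);
}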
//********** Vector saturating rounding shift left: (negative values shift right) ****************
//*************************************************************************************************
//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
//Saturation happens for left shifts only while rounding makes sense for right shifts only.
#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
        _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
        int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
        for (i = 0; i<LEN; i++) { \
        if (atmp[i] ==0) res[i] = 0; \
        else{ \
            if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
            else{ \
                if (btmp[i]>lanesize_1) { \
                    res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
                }else{ \
                    limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
                    if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
                        res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
                    else res[i] = atmp[i] << btmp[i]; }}}} \
        return _mm_load_si128((__m128i*)res);

#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
        _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
        int lanesize = (sizeof(TYPE) << 3); \
        _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
        for (i = 0; i<LEN; i++) { \
        if (atmp[i] ==0) {res[i] = 0; \
        }else{ \
            if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
            else{ \
                if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
                else{ \
                    limit = (TYPE) 1 << (lanesize - btmp[i]); \
                    res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
        return _mm_load_si128((__m128i*)res);

#define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
        __m64_128 res; int ## TYPE ## _t limit; int i; \
        int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
        for (i = 0; i<LEN; i++) { \
        if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
        else{ \
            if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
            else{ \
                if (b.m64_i ## TYPE[i]>lanesize_1) { \
                    res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
                }else{ \
                    limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
                    if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
                        res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
                    else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
        return res;

#define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
        __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
        int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
        for (i = 0; i<LEN; i++) { \
        if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
        }else{ \
            if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
            else{ \
                if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
                else{ \
                    limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
        return res;
int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
}

int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
}

int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
}

int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
}

uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
}

uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
}

uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
}

uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
}

int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
}

int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
}

int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
}

int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
}

uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
}

uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
}

uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
}

uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
}
// *********************************************************************************
// ***************************** Shifts by a constant *****************************
// *********************************************************************************
//**************** Vector shift right by constant*************************************
//************************************************************************************
int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
{
    //no 8 bit shift available, go to 16 bit
    int8x8_t res64;
    __m128i r;
    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
    r = _mm_srai_epi16 (r, b); //SSE2
    r = _mm_packs_epi16 (r,r); //we need 64 bits only
    return64(r);
}

int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b)
{
    int16x4_t res64;
    return64(_mm_srai_epi16(_pM128i(a), b));
}

int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b)
{
    int32x2_t res64;
    return64(_mm_srai_epi32(_pM128i(a), b));
}

int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //no arithmetic shift for 64bit values, serial solution used
    int64x1_t res;
    if(b>=64) res.m64_i64[0] = 0;
    else res.m64_i64[0] = (*(int64_t*)&a) >> b;
    return res;
}

uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
_NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
{
    //no 8 bit shift available, go to 16 bit
    uint8x8_t res64;
    __m128i r;
    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
    r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
    r = _mm_packus_epi16 (r,r); //we need 64 bits only
    return64(r);
}

uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
{
    uint16x4_t res64;
    return64(_mm_srli_epi16(_pM128i(a), b));
}

uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
_NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
{
    uint32x2_t res64;
    return64(_mm_srli_epi32(_pM128i(a), b));
}

uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
_NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
{
    uint64x1_t res64;
    return64(_mm_srli_epi64(_pM128i(a), b));
}

int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
{
    //no 8 bit shift available, go to 16 bit trick
    __m128i zero, mask0, a_sign, r, a_sign_mask;
    _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff};
    zero = _mm_setzero_si128();
    mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
    a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
    r = _mm_srai_epi16 (a, b);
    a_sign_mask = _mm_and_si128 (mask0, a_sign);
    r = _mm_andnot_si128 (mask0, r);
    return _mm_or_si128 (r, a_sign_mask);
}
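//Worked illustration of the trick above (lane values assumed): for adjacent int8 lanes {1, -1} and
//b == 1 the 16-bit arithmetic shift smears the low bit of the -1 byte into the top bit of the 1 byte;
//clearing the top b bits of every byte via mask0 and OR-ing the per-byte sign mask back in
//restores the exact per-lane results {0, -1}.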
int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
#define vshrq_n_s16 _mm_srai_epi16

int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
#define vshrq_n_s32 _mm_srai_epi32

int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
{
    //SIMD implementation may be not optimal due to 64 bit arithmetic shift absence in x86 SIMD
    __m128i c1, signmask, a0, res64;
    _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
    c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
    signmask = _mm_slli_epi64 (c1, (64 - b));
    a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
    a0 = _MM_CMPEQ_EPI64 (a, a0);
    signmask = _mm_and_si128(a0, signmask);
    res64 = _mm_srli_epi64 (a, b);
    return _mm_or_si128(res64, signmask);
}
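//Worked illustration of the sign fix-up above (lane value assumed): for a lane holding -4 and b == 1
//the logical shift gives 0x7FFFFFFFFFFFFFFE; because the lane's top bit was set, signmask keeps its
//top b bits (0x8000000000000000 here), and the final OR yields 0xFFFFFFFFFFFFFFFE, i.e. -2,
//matching a true 64-bit arithmetic shift.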
uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
_NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
{
    //no 8 bit shift available, need the special trick
    __m128i mask0, r;
    _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00};
    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
    r = _mm_srli_epi16 ( a, b);
    return _mm_and_si128 (r, mask0);
}

uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
#define vshrq_n_u16 _mm_srli_epi16

uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
#define vshrq_n_u32 _mm_srli_epi32

uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
#define vshrq_n_u64 _mm_srli_epi64
//*************************** Vector shift left by constant *************************
//*********************************************************************************
int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
_NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
{
    //no 8 bit shift available, go to 16 bit
    int8x8_t res64;
    __m128i r;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
    r = _mm_slli_epi16 (r, b); //SSE2
    r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
    return64(r);
}

int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
_NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b)
{
    int16x4_t res64;
    return64(_mm_slli_epi16(_pM128i(a), b));
}

int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
_NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b)
{
    int32x2_t res64;
    return64(_mm_slli_epi32(_pM128i(a), b));
}

int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
_NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b)
{
    int64x1_t res64;
    return64(_mm_slli_epi64(_pM128i(a), b));
}

uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
_NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
{
    //no 8 bit shift available, go to 16 bit
    uint8x8_t res64;
    __m128i mask8, r;
    mask8 = _mm_set1_epi16(0xff);
    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
    r = _mm_slli_epi16 (r, b); //SSE2
    r = _mm_and_si128(r, mask8); //to avoid saturation
    r = _mm_packus_epi16 (r,r); //we need 64 bits only
    return64(r);
}

uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
#define vshl_n_u16 vshl_n_s16

uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
#define vshl_n_u32 vshl_n_s32

uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
#define vshl_n_u64 vshl_n_s64

int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
#define vshlq_n_s8 vshlq_n_u8

int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
#define vshlq_n_s16 _mm_slli_epi16

int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
#define vshlq_n_s32 _mm_slli_epi32

int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
#define vshlq_n_s64 _mm_slli_epi64

uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
_NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
{
    //no 8 bit shift available, need the special trick
    __m128i mask0, r;
    _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff};
    mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift
    r = _mm_slli_epi16 ( a, b);
    return _mm_and_si128 (r, mask0);
}

uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
#define vshlq_n_u16 vshlq_n_s16

uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
#define vshlq_n_u32 vshlq_n_s32

uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
#define vshlq_n_u64 vshlq_n_s64
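//Hedged usage sketch (hypothetical helper): constant shifts are the usual way to move between
//integer and Qm.n fixed-point representations, assuming an 8.8 layout here.
_NEON2SSE_INLINE uint16x8_t example_q8_8_to_integer_u16(uint16x8_t fixed_point)
{
    return vshrq_n_u16(fixed_point, 8); //drop the 8 fractional bits of every lane
}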
//************* Vector rounding shift right by constant ******************
//*************************************************************************
//No corresponding x86 intrinsics exist, need to do some tricks
int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
{
    //no 8 bit shift available, go to 16 bit
    int8x8_t res64;
    __m128i r, maskb;
    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
    maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
    r = _mm_srai_epi16 (r, b);
    r = _mm_add_epi16 (r, maskb); //actual rounding
    r = _mm_packs_epi16 (r,r); //we need 64 bits only
    return64(r);
}

int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b)
{
    int16x4_t res64;
    return64(vrshrq_n_s16(_pM128i(a), b));
}

int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b)
{
    int32x2_t res64;
    return64(vrshrq_n_s32(_pM128i(a), b));
}

int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    //serial solution is faster
    int64x1_t res;
    int64_t a_i64 = *( int64_t*)&a;
    if(b==64) {
        res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63;
    } else {
        int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
        res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
    }
    return res;
}

uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
_NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
{
    //no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial one
    uint8x8_t res64;
    __m128i r, maskb;
    r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
    maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
    maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
    r = _mm_srli_epi16 (r, b);
    r = _mm_add_epi16 (r, maskb); //actual rounding
    r = _mm_packus_epi16 (r,r); //we need 64 bits only
    return64(r);
}

uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b)
{
    uint16x4_t res64;
    return64(vrshrq_n_u16(_pM128i(a), b));
}

uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
_NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b)
{
    uint32x2_t res64;
    return64(vrshrq_n_u32(_pM128i(a), b));
}

uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
_NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
{
    uint64x1_t res64;
    return64(vrshrq_n_u64(_pM128i(a), b));
}

int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
{
    //no 8 bit shift available, go to 16 bit trick
    __m128i r, mask1, maskb;
    _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
    r = vshrq_n_s8 (a, b);
    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
    maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
    return _mm_add_epi8(r, maskb); //actual rounding
}

int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
_NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
{
    __m128i maskb, r;
    maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
    r = _mm_srai_epi16 (a, b);
    return _mm_add_epi16 (r, maskb); //actual rounding
}
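//Illustrative usage sketch (hypothetical helper): rounding right shift adds the last bit shifted
//out, so halving 7 gives 4 and halving -7 gives -3, matching ARM's round-to-nearest-up behaviour.
_NEON2SSE_INLINE int16x8_t example_rounded_halving_s16(int16x8_t sums)
{
    return vrshrq_n_s16(sums, 1);
}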
int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
_NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
{
    __m128i maskb, r;
    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
    r = _mm_srai_epi32(a, b);
    return _mm_add_epi32 (r, maskb); //actual rounding
}

int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
{
    //solution may be not optimal compared with a serial one
    __m128i maskb, r;
    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
    r = vshrq_n_s64(a, b);
    return _mm_add_epi64 (r, maskb); //actual rounding
}

uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
_NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
{
    //no 8 bit shift available, go to 16 bit trick
    __m128i r, mask1, maskb;
    _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1
    r = vshrq_n_u8 (a, b);
    mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding
    maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
    maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1
    return _mm_add_epi8(r, maskb); //actual rounding
}

uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
{
    __m128i maskb, r;
    maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
    maskb = _mm_srli_epi16(maskb, 15); //1 or 0
    r = _mm_srli_epi16 (a, b);
    return _mm_add_epi16 (r, maskb); //actual rounding
}

uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
_NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
{
    __m128i maskb, r;
    maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
    maskb = _mm_srli_epi32 (maskb,31); //1 or 0
    r = _mm_srli_epi32(a, b);
    return _mm_add_epi32 (r, maskb); //actual rounding
}

uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
_NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
{
    //solution may be not optimal compared with a serial one
    __m128i maskb, r;
    maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
    maskb = _mm_srli_epi64 (maskb,63); //1 or 0
    r = _mm_srli_epi64(a, b);
    return _mm_add_epi64 (r, maskb); //actual rounding
}
//************* Vector shift right by constant and accumulate *********
//*********************************************************************
int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
{
    int8x8_t shift;
    shift = vshr_n_s8(b, c);
    return vadd_s8(a, shift);
}

int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
{
    int16x4_t shift;
    shift = vshr_n_s16(b, c);
    return vadd_s16(a, shift);
}

int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
{
    //may be not optimal compared with the serial execution
    int32x2_t shift;
    shift = vshr_n_s32(b, c);
    return vadd_s32(a, shift);
}

int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
_NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
{
    //may be not optimal compared with a serial solution
    int64x1_t shift;
    shift = vshr_n_s64(b, c);
    return vadd_s64(a, shift);
}

uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
_NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
{
    uint8x8_t shift;
    shift = vshr_n_u8(b, c);
    return vadd_u8(a, shift);
}

uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
{
    uint16x4_t shift;
    shift = vshr_n_u16(b, c);
    return vadd_u16(a, shift);
}

uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
_NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
{
    //may be not optimal compared with the serial execution
    uint32x2_t shift;
    shift = vshr_n_u32(b, c);
    return vadd_u32(a, shift);
}

uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
_NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
{
    //may be not optimal compared with the serial execution
    uint64x1_t shift;
    shift = vshr_n_u64(b, c);
    return vadd_u64(a, shift);
}

int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
{
    int8x16_t shift;
    shift = vshrq_n_s8(b, c);
    return vaddq_s8(a, shift);
}

int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
_NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
{
    int16x8_t shift;
    shift = vshrq_n_s16(b, c);
    return vaddq_s16(a, shift);
}

int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
_NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
{
    int32x4_t shift;
    shift = vshrq_n_s32(b, c);
    return vaddq_s32(a, shift);
}

int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
{
    int64x2_t shift;
    shift = vshrq_n_s64(b, c);
    return vaddq_s64(a, shift);
}

uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
_NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
{
    uint8x16_t shift;
    shift = vshrq_n_u8(b, c);
    return vaddq_u8(a, shift);
}

uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
{
    uint16x8_t shift;
    shift = vshrq_n_u16(b, c);
    return vaddq_u16(a, shift);
}

uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
_NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
{
    uint32x4_t shift;
    shift = vshrq_n_u32(b, c);
    return vaddq_u32(a, shift);
}

uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
_NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
{
    uint64x2_t shift;
    shift = vshrq_n_u64(b, c);
    return vaddq_u64(a, shift);
}
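//Illustrative usage sketch (hypothetical helper): shift-right-and-accumulate folds the extracted
//high bits straight into an accumulator, here counting set sign bits per lane.
_NEON2SSE_INLINE uint32x4_t example_accumulate_sign_bits_u32(uint32x4_t counters, uint32x4_t samples)
{
    return vsraq_n_u32(counters, samples, 31); //adds each lane's most significant bit to its counter
}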
8023 //************* Vector rounding shift right by constant and accumulate ****************************
8024 //************************************************************************************************
8025 int8x8_t
vrsra_n_s8(int8x8_t a
, int8x8_t b
, __constrange(1,8) int c
); // VRSRA.S8 d0,d0,#8
8026 _NEON2SSE_INLINE int8x8_t
vrsra_n_s8(int8x8_t a
, int8x8_t b
, __constrange(1,8) int c
) // VRSRA.S8 d0,d0,#8
8029 shift
= vrshr_n_s8(b
, c
);
8030 return vadd_s8( a
, shift
);
8033 int16x4_t
vrsra_n_s16(int16x4_t a
, int16x4_t b
, __constrange(1,16) int c
); // VRSRA.S16 d0,d0,#16
8034 _NEON2SSE_INLINE int16x4_t
vrsra_n_s16(int16x4_t a
, int16x4_t b
, __constrange(1,16) int c
) // VRSRA.S16 d0,d0,#16
8037 shift
= vrshr_n_s16( b
, c
);
8038 return vadd_s16(a
, shift
);
8041 int32x2_t
vrsra_n_s32(int32x2_t a
, int32x2_t b
, __constrange(1,32) int c
); // VRSRA.S32 d0,d0,#32
8042 _NEON2SSE_INLINE int32x2_t
vrsra_n_s32(int32x2_t a
, int32x2_t b
, __constrange(1,32) int c
) // VRSRA.S32 d0,d0,#32
8044 //may be not optimal compared with the serial execution
8046 shift
= vrshr_n_s32(b
, c
);
8047 return vadd_s32( a
, shift
);
8050 int64x1_t
vrsra_n_s64(int64x1_t a
, int64x1_t b
, __constrange(1,64) int c
); // VRSRA.S64 d0,d0,#64
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
{
    int64x1_t shift;
    shift = vrshr_n_s64(b, c);
    return vadd_s64( a, shift);
}

uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
_NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
{
    uint8x8_t shift;
    shift = vrshr_n_u8(b, c);
    return vadd_u8(a, shift);
}

uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
_NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.U16 d0,d0,#16
{
    uint16x4_t shift;
    shift = vrshr_n_u16(b, c);
    return vadd_u16(a, shift);
}

uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
_NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
{
    //may not be optimal compared with the serial execution
    uint32x2_t shift;
    shift = vrshr_n_u32(b, c);
    return vadd_u32( a, shift);
}

uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
{
    uint64x1_t shift;
    shift = vrshr_n_u64(b, c);
    return vadd_u64( a, shift);
}

int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
{
    int8x16_t shift;
    shift = vrshrq_n_s8(b, c);
    return vaddq_s8(a, shift);
}

int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
_NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
{
    int16x8_t shift;
    shift = vrshrq_n_s16(b, c);
    return vaddq_s16(a, shift);
}

int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
_NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
{
    int32x4_t shift;
    shift = vrshrq_n_s32(b, c);
    return vaddq_s32(a, shift);
}

int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
{
    int64x2_t shift;
    shift = vrshrq_n_s64(b, c);
    return vaddq_s64(a, shift);
}

uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
_NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
{
    uint8x16_t shift;
    shift = vrshrq_n_u8(b, c);
    return vaddq_u8(a, shift);
}

uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.U16 q0,q0,#16
{
    uint16x8_t shift;
    shift = vrshrq_n_u16(b, c);
    return vaddq_u16(a, shift);
}

uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
_NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
{
    uint32x4_t shift;
    shift = vrshrq_n_u32(b, c);
    return vaddq_u32(a, shift);
}

uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
_NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
{
    uint64x2_t shift;
    shift = vrshrq_n_u64(b, c);
    return vaddq_u64(a, shift);
}
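//Editor's note: a minimal usage sketch of the accumulating rounding shifts above (illustrative only,
//not part of the original header; pointer names are hypothetical):
//    uint8x8_t acc  = vld1_u8(acc_ptr);   //running sums
//    uint8x8_t corr = vld1_u8(corr_ptr);  //corrections to be scaled down
//    acc = vrsra_n_u8(acc, corr, 3);      //acc[i] += (corr[i] + 4) >> 3, i.e. rounded corr/8
//    vst1_u8(acc_ptr, acc);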
//**********************Vector saturating shift left by constant *****************************
//********************************************************************************************
//we don't check const ranges, assuming they are met
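//Editor's note: unlike a plain left shift, these intrinsics clamp the result to the type range instead of
//wrapping. For example (values are illustrative): vqshl_n_s8 applied to 100 with shift 1 yields 127
//(0x7f, the s8 maximum) rather than the wrapped value -56, and vqshl_n_u8 applied to 200 with shift 1
//yields 255 rather than 144.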
int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
_NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
{
    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in the packs function)
    int8x8_t res64;
    __m128i a128, r128;
    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
    r128 = _mm_slli_epi16 (a128, b);
    r128 = _mm_packs_epi16 (r128, r128); //saturated s8, use 64 low bits only
    return64(r128);
}
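//Editor's note: a sketch of the pattern used above (it simply restates the code): widen the 8-bit lanes
//to 16 bits, do the shift at 16-bit precision where it cannot overflow, then narrow back with a
//saturating pack:
//    widened = _MM_CVTEPI8_EPI16(x);                 //s8  -> s16
//    shifted = _mm_slli_epi16(widened, n);           //exact, no overflow for n <= 7
//    result  = _mm_packs_epi16(shifted, shifted);    //saturating s16 -> s8 narrowing
//The same widen/shift/pack-with-saturation idea is reused by most of the 8- and 16-bit variants below.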
int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
_NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
{
    // go to 32 bit to get the auto saturation (in the packs function)
    int16x4_t res64;
    __m128i a128, r128;
    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
    r128 = _mm_slli_epi32 (a128, b); //shift_res
    r128 = _mm_packs_epi32 (r128, r128); //saturated s16, use 64 low bits only
    return64(r128);
}

int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
_NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b)
{
    //serial execution may be faster
    int32x2_t res64;
    return64(vqshlq_n_s32 (_pM128i(a), b));
}

int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    // no effective SIMD solution here
    int64x1_t res;
    int64_t bmask;
    int64_t a_i64 = *( int64_t*)&a;
    bmask = ( int64_t)1 << (63 - b); //positive overflow threshold
    if (a_i64 >= bmask) {
        res.m64_i64[0] = ~(_SIGNBIT64);
    } else {
        res.m64_i64[0] = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
    }
    return res;
}

uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
_NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
{
    //no 8 bit shift available in IA32 SIMD, go to 16 bit
    uint8x8_t res64;
    __m128i a128, r128;
    a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
    r128 = _mm_slli_epi16 (a128, b); //shift_res
    r128 = _mm_packus_epi16 (r128, r128); //saturated u8, use 64 low bits only
    return64(r128);
}

uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.U16 d0,d0,#0
{
    // go to 32 bit to get the auto saturation (in the packus function)
    uint16x4_t res64;
    __m128i a128, r128;
    a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
    r128 = _mm_slli_epi32 (a128, b); //shift_res
    r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16
    return64(r128);
}

uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
_NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b)
{
    uint32x2_t res64;
    return64(vqshlq_n_u32(_pM128i(a), b));
}

uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    // no effective SIMD solution here
    uint64x1_t res;
    uint64_t bmask;
    uint64_t a_i64 = *(uint64_t*)&a;
    bmask = ( uint64_t)1 << (64 - b);
    res.m64_u64[0] = ((a_i64 >= bmask) && (b > 0)) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
    return res;
}

int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
_NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
{
    // go to 16 bit to get the auto saturation (in the packs function)
    __m128i a128, r128_1, r128_2;
    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
    r128_1 = _mm_slli_epi16 (a128, b);
    //swap hi and low part of a128 to process the remaining data
    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    a128 = _MM_CVTEPI8_EPI16 (a128);
    r128_2 = _mm_slli_epi16 (a128, b);
    return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
}

int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
_NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
{
    // manual saturation solution looks LESS optimal than the 32 bit conversion one
    // go to 32 bit to get the auto saturation (in the packs function)
    __m128i a128, r128_1, r128_2;
    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
    //swap hi and low part of a128 to process the remaining data
    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    a128 = _MM_CVTEPI16_EPI32 (a128);
    r128_2 = _mm_slli_epi32 (a128, b);
    return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
}

int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
_NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
{
    // no 64 bit saturation option available, special tricks necessary
    __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
    c1 = _mm_cmpeq_epi32(a, a); //0xff..ff
    maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) leading zeros and (31-b) ones
    saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff..ff if we need saturation, 0 otherwise
    c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff where needed and zeros if not
    shift_res = _mm_slli_epi32 (a, b);
    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
    //result with positive numbers saturated
    shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
    //treat negative numbers
    maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) leading ones and (31-b) zeros
    saturation_mask = _mm_cmpgt_epi32 (maskA, a); //0xff..ff if we need saturation, 0 otherwise
    c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 where needed and zeros if not
    shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
    return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
}
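//Editor's note: a worked example of the mask arithmetic above for b = 1 (illustrative): maskA becomes
//0xffffffff >> 2 = 0x3fffffff, so any lane greater than 0x3fffffff would overflow when shifted left by 1
//and its saturation_mask lane turns into all ones; c7ffffff_mask then supplies 0x7fffffff for such lanes
//while the andnot keeps the plain shift result for the rest. The negative branch mirrors this with
//0xffffffff << 30 = 0xc0000000 as the threshold and 0x80000000 as the saturated value.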
int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    // no effective SIMD solution here
    _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
    int64_t bmask;
    int i;
    bmask = ( int64_t)1 << (63 - b); //positive overflow threshold
    _mm_store_si128((__m128i*)atmp, a);
    for (i = 0; i<2; i++) {
        if (atmp[i] >= bmask) {
            res[i] = ~(_SIGNBIT64);
        } else {
            res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
        }
    }
    return _mm_load_si128((__m128i*)res);
}

uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
_NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
{
    // go to 16 bit to get the auto saturation (in the packus function)
    __m128i a128, r128_1, r128_2;
    a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
    r128_1 = _mm_slli_epi16 (a128, b);
    //swap hi and low part of a128 to process the remaining data
    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    a128 = _MM_CVTEPU8_EPI16 (a128);
    r128_2 = _mm_slli_epi16 (a128, b);
    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
}

uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.U16 q0,q0,#0
{
    // manual saturation solution looks more optimal than the 32 bit conversion one
    __m128i cb, c8000, a_signed, saturation_mask, shift_res;
    cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
    c8000 = _mm_set1_epi16 (0x8000);
    //no unsigned shorts comparison in SSE, only signed available, so need the trick
    a_signed = _mm_sub_epi16(a, c8000); //go to signed
    saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
    shift_res = _mm_slli_epi16 (a, b);
    return _mm_or_si128 (shift_res, saturation_mask);
}
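//Editor's note: the "go to signed" trick above works because SSE2 has no unsigned 16-bit compare:
//subtracting 0x8000 maps the unsigned range 0..0xffff onto the signed range -32768..32767 while
//preserving the order, so a signed greater-than against the biased threshold cb detects exactly the
//lanes whose unsigned value would overflow the shift; those lanes are then forced to 0xffff by OR-ing
//with the all-ones compare mask.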
uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
_NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
{
    // manual saturation solution, no 64 bit saturation option, the serial version may be faster
    __m128i cb, c80000000, a_signed, saturation_mask, shift_res;
    cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
    c80000000 = _mm_set1_epi32 (0x80000000);
    //no unsigned ints comparison in SSE, only signed available, so need the trick
    a_signed = _mm_sub_epi32(a, c80000000); //go to signed
    saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
    shift_res = _mm_slli_epi32 (a, b);
    return _mm_or_si128 (shift_res, saturation_mask);
}

uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    // no effective SIMD solution here
    _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
    uint64_t bmask;
    int i;
    bmask = ( uint64_t)1 << (64 - b);
    _mm_store_si128((__m128i*)atmp, a);
    for (i = 0; i<2; i++) {
        res[i] = ((atmp[i] >= bmask) && (b > 0)) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
    }
    return _mm_load_si128((__m128i*)res);
}
//**************Vector signed->unsigned saturating shift left by constant *************
//*************************************************************************************
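//Editor's note: these take a signed input and produce an unsigned result, so negative lanes clamp to 0
//and positive lanes that overflow clamp to the unsigned maximum. Illustrative values for vqshlu_n_s8
//with shift 2: -5 -> 0, 50 -> 200, 100 -> 255 (saturated).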
uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
_NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
{
    //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in the packus function)
    uint8x8_t res64;
    __m128i a128, r128;
    a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
    r128 = _mm_slli_epi16 (a128, b);
    r128 = _mm_packus_epi16 (r128, r128); //saturated u8, use 64 low bits only
    return64(r128);
}

uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
_NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
{
    // go to 32 bit to get the auto saturation (in the packus function)
    uint16x4_t res64;
    __m128i a128, r128;
    a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
    r128 = _mm_slli_epi32 (a128, b); //shift_res
    r128 = _MM_PACKUS1_EPI32 (r128); //saturated u16, use 64 low bits only
    return64(r128);
}

uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b)
{
    uint32x2_t res64;
    return64( vqshluq_n_s32(_pM128i(a), b));
}

uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
{
    uint64x1_t res;
    uint64_t limit;
    if (a.m64_i64[0] <= 0) {
        res.m64_u64[0] = 0;
    } else {
        limit = (uint64_t) 1 << (64 - b);
        res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
    }
    return res;
}

uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
_NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
{
    __m128i a128, r128_1, r128_2;
    a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
    r128_1 = _mm_slli_epi16 (a128, b);
    //swap hi and low part of a128 to process the remaining data
    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    a128 = _MM_CVTEPI8_EPI16 (a128);
    r128_2 = _mm_slli_epi16 (a128, b);
    return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
}

uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
_NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
{
    // manual saturation solution looks LESS optimal than the 32 bit conversion one
    __m128i a128, r128_1, r128_2;
    a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
    r128_1 = _mm_slli_epi32 (a128, b); //shift_res
    //swap hi and low part of a128 to process the remaining data
    a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
    a128 = _MM_CVTEPI16_EPI32 (a128);
    r128_2 = _mm_slli_epi32 (a128, b);
    return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated u16
}

uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
_NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
{
    //solution may be not optimal compared with the serial one
    __m128i zero, maskA, maskGT0, a0, a_masked, a_shift;
    zero = _mm_setzero_si128();
    maskA = _mm_cmpeq_epi32(a, a);
    maskA = _mm_slli_epi32(maskA, (32 - b)); // b leading ones and (32-b) zeros
    //saturate negative numbers to zero
    maskGT0 = _mm_cmpgt_epi32 (a, zero); //0xffffffff if the number is positive and zero otherwise (negative numbers)
    a0 = _mm_and_si128 (a, maskGT0); //negatives are zeros now
    //saturate positive numbers to 0xffffffff
    a_masked = _mm_and_si128 (a0, maskA);
    a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation is necessary, 0 otherwise
    a_shift = _mm_slli_epi32 (a0, b);
    return _mm_or_si128 (a_shift, a_masked); //actual saturation
}

uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    // no effective SIMD solution here, serial execution looks faster
    _NEON2SSE_ALIGN_16 int64_t atmp[2];
    _NEON2SSE_ALIGN_16 uint64_t res[2];
    uint64_t limit;
    int i;
    _mm_store_si128((__m128i*)atmp, a);
    for (i = 0; i<2; i++) {
        if (atmp[i] <= 0) {
            res[i] = 0;
        } else {
            limit = (uint64_t) 1 << (64 - b);
            res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
        }
    }
    return _mm_load_si128((__m128i*)res);
}
//************** Vector narrowing shift right by constant **************
//**********************************************************************
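//Editor's note: narrowing here means "shift, then keep only the low half of each lane". Illustrative
//s16 example for vshrn_n_s16 with shift 8: lane 0x1234 becomes 0x12, lane 0xff00 (-256) becomes 0xff
//(-1); no saturation is applied, which is why the byte shuffle below is used instead of _mm_packs.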
int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
_NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
{
    int8x8_t res64;
    __m128i r16;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    r16 = vshrq_n_s16(a, b);
    r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
    return64(r16);
}

int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
_NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
{
    int16x4_t res64;
    __m128i r32;
    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
    r32 = vshrq_n_s32(a, b);
    r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
    return64(r32);
}

int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
_NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
{
    int32x2_t res64;
    __m128i r64;
    r64 = vshrq_n_s64(a, b);
    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bit values
    return64(r64);
}

uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
{
    uint8x8_t res64;
    __m128i mask, r16;
    mask = _mm_set1_epi16(0xff);
    r16 = vshrq_n_s16(a, b); //after the right shift by b>=1 the unsigned value fits into the signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
    r16 = _mm_and_si128(r16, mask); //to avoid saturation
    r16 = _mm_packus_epi16 (r16, r16); //narrow, use low 64 bits only
    return64(r16);
}

uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
{
    uint16x4_t res64;
    __m128i mask, r32;
    mask = _mm_set1_epi32(0xffff);
    r32 = vshrq_n_u32(a, b); //after the right shift by b>=1 the unsigned value fits into the signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
    r32 = _mm_and_si128(r32, mask); //to avoid saturation
    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
    return64(r32);
}

uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
_NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
{
    uint32x2_t res64;
    __m128i r64;
    r64 = vshrq_n_u64(a, b);
    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bit values
    return64(r64);
}
//************** Vector signed->unsigned narrowing saturating shift right by constant ********
//*********************************************************************************************
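//Editor's note: VQSHRUN shifts a signed wide lane right and then saturates it into the unsigned narrow
//type, so negative results clamp to 0 and too-large results clamp to the unsigned maximum. Illustrative
//s16 -> u8 values for shift 4: 0x07f0 -> 0x7f, 0x7fff -> 0xff (saturated), -16 -> 0.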
uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
{
    uint8x8_t res64;
    __m128i r16;
    r16 = vshrq_n_s16(a, b);
    r16 = _mm_packus_epi16 (r16, r16); //saturate and narrow (signed to unsigned), use low 64 bits only
    return64(r16);
}

uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
{
    uint16x4_t res64;
    __m128i r32;
    r32 = vshrq_n_s32(a, b);
    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
    return64(r32);
}

uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
{
    _NEON2SSE_ALIGN_16 int64_t atmp[2];
    uint32x2_t res;
    int64_t res64;
    _mm_store_si128((__m128i*)atmp, a);
    if (atmp[0] < 0) {
        res.m64_u32[0] = 0;
    } else {
        res64 = (atmp[0] >> b);
        res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
    }
    if (atmp[1] < 0) {
        res.m64_u32[1] = 0;
    } else {
        res64 = (atmp[1] >> b);
        res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
    }
    return res;
}

//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
{
    //solution may be not optimal compared with the serial one
    uint8x8_t res64;
    __m128i r16;
    r16 = vrshrq_n_s16(a, b);
    r16 = _mm_packus_epi16 (r16, r16); //saturate and narrow (signed to unsigned), use low 64 bits only
    return64(r16);
}

uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
{
    //solution may be not optimal compared with the serial one
    uint16x4_t res64;
    __m128i r32;
    r32 = vrshrq_n_s32(a, b);
    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
    return64(r32);
}

uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
{
    _NEON2SSE_ALIGN_16 int64_t atmp[2];
    uint32x2_t res;
    int64_t res64;
    _mm_store_si128((__m128i*)atmp, a);
    if (atmp[0] < 0) {
        res.m64_u32[0] = 0;
    } else {
        res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
        res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : (uint32_t)res64;
    }
    if (atmp[1] < 0) {
        res.m64_u32[1] = 0;
    } else {
        res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) );
        res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : (uint32_t)res64;
    }
    return res;
}
//***** Vector narrowing saturating shift right by constant ******
//*****************************************************************
int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
_NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
{
    int8x8_t res64;
    __m128i r16;
    r16 = vshrq_n_s16(a, b);
    r16 = _mm_packs_epi16 (r16, r16); //saturate and narrow, use low 64 bits only
    return64(r16);
}

int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
_NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
{
    int16x4_t res64;
    __m128i r32;
    r32 = vshrq_n_s32(a, b);
    r32 = _mm_packs_epi32 (r32, r32); //saturate and narrow, use low 64 bits only
    return64(r32);
}

int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    //no optimal SIMD solution found
    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
    int32x2_t res;
    _mm_store_si128((__m128i*)atmp, a);
    res64[0] = (atmp[0] >> b);
    res64[1] = (atmp[1] >> b);
    if (res64[0] > SINT_MAX) res64[0] = SINT_MAX;
    if (res64[0] < SINT_MIN) res64[0] = SINT_MIN;
    if (res64[1] > SINT_MAX) res64[1] = SINT_MAX;
    if (res64[1] < SINT_MIN) res64[1] = SINT_MIN;
    res.m64_i32[0] = (int32_t)res64[0];
    res.m64_i32[1] = (int32_t)res64[1];
    return res;
}

uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.U16 d0,q0,#8
{
    uint8x8_t res64;
    __m128i r16;
    r16 = vshrq_n_u16(a, b); //after the right shift by b>=1 the unsigned value fits into the signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
    r16 = _mm_packus_epi16 (r16, r16); //saturate and narrow, use low 64 bits only
    return64(r16);
}

uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
{
    uint16x4_t res64;
    __m128i r32;
    r32 = vshrq_n_u32(a, b); //after the right shift by b>=1 the unsigned value fits into the signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
    return64(r32);
}

uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
_NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
{
    //serial solution may be faster
    uint32x2_t res64;
    __m128i r64, res_hi, zero;
    zero = _mm_setzero_si128();
    r64 = vshrq_n_u64(a, b);
    res_hi = _mm_srli_epi64(r64, 32);
    res_hi = _mm_cmpgt_epi32(res_hi, zero);
    r64 = _mm_or_si128(r64, res_hi);
    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bit values
    return64(r64);
}
//********* Vector rounding narrowing shift right by constant *************************
//****************************************************************************************
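//Editor's note: "rounding" right shifts compute (x + (1 << (b-1))) >> b instead of x >> b, i.e. half of
//the last discarded bit position is added before shifting. Illustrative values for b = 4: the rounding
//term is 8, so 7 still narrows to 0 (7 + 8 = 15, 15 >> 4 = 0) while 8 rounds up to 1 (8 + 8 = 16,
//16 >> 4 = 1); the plain shift would give 0 in both cases.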
int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
_NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
{
    int8x8_t res64;
    __m128i r16;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    r16 = vrshrq_n_s16(a, b);
    r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
    return64(r16);
}

int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
_NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
{
    int16x4_t res64;
    __m128i r32;
    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
    r32 = vrshrq_n_s32(a, b);
    r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
    return64(r32);
}

int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
_NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
{
    int32x2_t res64;
    __m128i r64;
    r64 = vrshrq_n_s64(a, b);
    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bit values
    return64(r64);
}

uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
{
    uint8x8_t res64;
    __m128i mask, r16;
    mask = _mm_set1_epi16(0xff);
    r16 = vrshrq_n_s16(a, b); //after the right shift by b>=1 the unsigned value fits into the signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
    r16 = _mm_and_si128(r16, mask); //to avoid saturation
    r16 = _mm_packus_epi16 (r16, r16); //saturate and narrow, use low 64 bits only
    return64(r16);
}

uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
{
    uint16x4_t res64;
    __m128i mask, r32;
    mask = _mm_set1_epi32(0xffff);
    r32 = vrshrq_n_u32(a, b); //after the right shift by b>=1 the unsigned value fits into the signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
    r32 = _mm_and_si128(r32, mask); //to avoid saturation
    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
    return64(r32);
}

uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
_NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
{
    uint32x2_t res64;
    __m128i r64;
    r64 = vrshrq_n_u64(a, b);
    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bit values
    return64(r64);
}
//************* Vector rounding narrowing saturating shift right by constant ************
//****************************************************************************************
int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
_NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
{
    int8x8_t res64;
    __m128i r16;
    r16 = vrshrq_n_s16(a, b);
    r16 = _mm_packs_epi16 (r16, r16); //saturate and narrow, use low 64 bits only
    return64(r16);
}

int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
_NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
{
    int16x4_t res64;
    __m128i r32;
    r32 = vrshrq_n_s32(a, b);
    r32 = _mm_packs_epi32 (r32, r32); //saturate and narrow, use low 64 bits only
    return64(r32);
}

int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    //no optimal SIMD solution found
    _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
    int32x2_t res;
    _mm_store_si128((__m128i*)atmp, a);
    maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
    res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
    maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
    res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
    if (res64[0] > SINT_MAX) res64[0] = SINT_MAX;
    if (res64[0] < SINT_MIN) res64[0] = SINT_MIN;
    if (res64[1] > SINT_MAX) res64[1] = SINT_MAX;
    if (res64[1] < SINT_MIN) res64[1] = SINT_MIN;
    res.m64_i32[0] = (int32_t)res64[0];
    res.m64_i32[1] = (int32_t)res64[1];
    return res;
}

uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.U16 d0,q0,#8
{
    uint8x8_t res64;
    __m128i r16;
    r16 = vrshrq_n_u16(a, b); //after the right shift by b>=1 the unsigned value fits into the signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
    r16 = _mm_packus_epi16 (r16, r16); //saturate and narrow, use low 64 bits only
    return64(r16);
}

uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
{
    uint16x4_t res64;
    __m128i r32;
    r32 = vrshrq_n_u32(a, b); //after the right shift by b>=1 the unsigned value fits into the signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only
    return64(r32);
}

uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
_NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
{
    //serial solution may be faster
    uint32x2_t res64;
    __m128i r64, res_hi, zero;
    zero = _mm_setzero_si128();
    r64 = vrshrq_n_u64(a, b);
    res_hi = _mm_srli_epi64(r64, 32);
    res_hi = _mm_cmpgt_epi32(res_hi, zero);
    r64 = _mm_or_si128(r64, res_hi);
    r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bit values
    return64(r64);
}
//************** Vector widening shift left by constant ****************
//************************************************************************
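//Editor's note: VSHLL widens each lane to twice its size before shifting, so no bits can be lost for the
//allowed shift range. Sketch of the pattern used below: sign- or zero-extend the 64-bit input half into a
//full 128-bit register (_MM_CVTEPI8_EPI16 for signed, _mm_unpacklo_epi8 with zeros for unsigned), then
//shift the widened lanes.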
int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
_NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
{
    __m128i r;
    r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
    return _mm_slli_epi16 (r, b);
}

int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
_NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
{
    __m128i r;
    r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1
    return _mm_slli_epi32 (r, b);
}

int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
_NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
{
    __m128i r;
    r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1
    return _mm_slli_epi64 (r, b);
}

uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
_NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
{
    //no uint8 to uint16 conversion available, manual conversion used
    __m128i zero, r;
    zero = _mm_setzero_si128 ();
    r = _mm_unpacklo_epi8(_pM128i(a), zero);
    return _mm_slli_epi16 (r, b);
}

uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.U16 q0,d0,#0
{
    //no uint16 to uint32 conversion available, manual conversion used
    __m128i zero, r;
    zero = _mm_setzero_si128 ();
    r = _mm_unpacklo_epi16(_pM128i(a), zero);
    return _mm_slli_epi32 (r, b);
}

uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
_NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
{
    //no uint32 to uint64 conversion available, manual conversion used
    __m128i zero, r;
    zero = _mm_setzero_si128 ();
    r = _mm_unpacklo_epi32(_pM128i(a), zero);
    return _mm_slli_epi64 (r, b);
}
//************************************************************************************
//**************************** Shifts with insert ************************************
//************************************************************************************
//Takes each element of a vector, shifts it by an immediate value,
//and inserts the result into the destination vector. Bits shifted out of each element are lost.

//**************** Vector shift right and insert ************************************
//Only the "c" leftmost bits of "a" remain in the result; all other bits are taken from the shifted "b".
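//Editor's note: an illustrative 8-bit example of these semantics with c = 3: for a = 0b10110001 and
//b = 0b11111111, the result keeps the top 3 bits of a and receives b >> 3 in the low 5 bits:
//(a & 0b11100000) | (b >> 3) = 0b10100000 | 0b00011111 = 0b10111111.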
int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
_NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c)
{
    int8x8_t res64;
    return64(vsriq_n_s8(_pM128i(a), _pM128i(b), c));
}

int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
_NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c)
{
    int16x4_t res64;
    return64(vsriq_n_s16(_pM128i(a), _pM128i(b), c));
}

int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
_NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c)
{
    int32x2_t res64;
    return64(vsriq_n_s32(_pM128i(a), _pM128i(b), c));
}

int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
_NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
{
    int64x1_t res;
    res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for the shift to get leading zeros
    return res;
}

uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
#define vsri_n_u8 vsri_n_s8

uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
#define vsri_n_u16 vsri_n_s16

uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
#define vsri_n_u32 vsri_n_s32

uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
#define vsri_n_u64 vsri_n_s64

poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
#define vsri_n_p8 vsri_n_u8

poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
#define vsri_n_p16 vsri_n_u16

int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
_NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
{
    __m128i maskA, a_masked;
    __m128i b_shift;
    _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, index 0 not used
    maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c) zeros
    a_masked = _mm_and_si128 (a, maskA);
    b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to the logical shift
    return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
}

int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
_NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
{
    //to keep the c leftmost bits of a we shift a right by (16-c) and then back left by (16-c), zeroing the (16-c) rightmost bits
    __m128i b_shift;
    __m128i a_c;
    b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to the logical shift
    a_c = vshrq_n_u16( a, (16 - c));
    a_c = _mm_slli_epi16(a_c, (16 - c)); //the logical shift provides the (16-c) rightmost zero bits in a
    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}

int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
_NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
{
    //to keep the c leftmost bits of a we shift a right by (32-c) and then back left by (32-c), zeroing the (32-c) rightmost bits
    __m128i b_shift;
    __m128i a_c;
    b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to the logical shift
    a_c = vshrq_n_u32( a, (32 - c));
    a_c = _mm_slli_epi32(a_c, (32 - c)); //the logical shift provides the (32-c) rightmost zero bits in a
    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}

int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
_NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
{
    //serial solution may be faster
    __m128i b_shift;
    __m128i a_c;
    b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to the logical shift
    a_c = _mm_srli_epi64(a, (64 - c));
    a_c = _mm_slli_epi64(a_c, (64 - c)); //the logical shift provides the (64-c) rightmost zero bits in a
    return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
}

uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
#define vsriq_n_u8 vsriq_n_s8

uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
#define vsriq_n_u16 vsriq_n_s16

uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
#define vsriq_n_u32 vsriq_n_s32

uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
#define vsriq_n_u64 vsriq_n_s64

poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
#define vsriq_n_p8 vsriq_n_u8

poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
#define vsriq_n_p16 vsriq_n_u16
//***** Vector shift left and insert *********************************************
//*********************************************************************************
//Only the "c" rightmost bits of "a" remain in the result; all other bits are taken from the shifted "b".
//Trailing zeros are inserted into b by the shift, so we need to combine "a" and the shifted "b".
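//Editor's note: the mirror image of VSRI, illustrated on 8 bits with c = 3: for a = 0b10110001 and
//b = 0b11111111, the result keeps the low 3 bits of a and receives b << 3 in the high 5 bits:
//(b << 3) | (a & 0b00000111) = 0b11111000 | 0b00000001 = 0b11111001.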
int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
_NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c)
{
    int8x8_t res64;
    return64(vsliq_n_s8(_pM128i(a), _pM128i(b), c));
}

int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
_NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c)
{
    int16x4_t res64;
    return64(vsliq_n_s16(_pM128i(a), _pM128i(b), c));
}

int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
_NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c)
{
    int32x2_t res64;
    return64(vsliq_n_s32(_pM128i(a), _pM128i(b), c));
}

int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
_NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
{
    int64x1_t res;
    res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
    return res;
}

uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
#define vsli_n_u8 vsli_n_s8

uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
#define vsli_n_u16 vsli_n_s16

uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
#define vsli_n_u32 vsli_n_s32

uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
#define vsli_n_u64 vsli_n_s64

poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
#define vsli_n_p8 vsli_n_u8

poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
#define vsli_n_p16 vsli_n_u16

int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
_NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
{
    __m128i maskA, a_masked;
    __m128i b_shift;
    _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
    maskA = _mm_set1_epi8(maskRight[c]); // (8-c) zeros and c ones
    b_shift = vshlq_n_s8( b, c);
    a_masked = _mm_and_si128 (a, maskA);
    return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
}

int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
_NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
{
    //to keep the c rightmost bits of a we shift a left by (16-c) and then logically shift back right by (16-c), zeroing the (16-c) leftmost bits
    __m128i b_shift;
    __m128i a_c;
    b_shift = vshlq_n_s16( b, c);
    a_c = vshlq_n_s16( a, (16 - c));
    a_c = _mm_srli_epi16(a_c, (16 - c));
    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}

int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
_NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
{
    //solution may be not optimal compared with the serial one
    //to keep the c rightmost bits of a we shift a left by (32-c) and then logically shift back right by (32-c), zeroing the (32-c) leftmost bits
    __m128i b_shift;
    __m128i a_c;
    b_shift = vshlq_n_s32( b, c);
    a_c = vshlq_n_s32( a, (32 - c));
    a_c = _mm_srli_epi32(a_c, (32 - c));
    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}

int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
_NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
{
    //solution may be not optimal compared with the serial one
    //to keep the c rightmost bits of a we shift a left by (64-c) and then logically shift back right by (64-c), zeroing the (64-c) leftmost bits
    __m128i b_shift;
    __m128i a_c;
    b_shift = vshlq_n_s64( b, c);
    a_c = vshlq_n_s64( a, (64 - c));
    a_c = _mm_srli_epi64(a_c, (64 - c));
    return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
}

uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
#define vsliq_n_u8 vsliq_n_s8

uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
#define vsliq_n_u16 vsliq_n_s16

uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
#define vsliq_n_u32 vsliq_n_s32

uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
#define vsliq_n_u64 vsliq_n_s64

poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
#define vsliq_n_p8 vsliq_n_u8

poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
#define vsliq_n_p16 vsliq_n_u16
// ***********************************************************************************************
// ****************** Loads and stores of a single vector ***************************************
// ***********************************************************************************************
//Performs loads and stores of a single vector of some type.
//*******************************  Loads ********************************************************
// ***********************************************************************************************
//In the general case we assume ptr is NOT aligned and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
//On SSE3-capable systems __m128i _mm_lddqu_si128 (__m128i const* p) may be advantageous for unaligned access:
//it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
//If ptr is 16-byte aligned then __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead.
#define LOAD_SI128(ptr) \
        ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr));
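//Editor's note: a minimal usage sketch (illustrative only; the buffer name is hypothetical):
//    uint8_t buf[16];                 //no alignment guarantee needed
//    uint8x16_t v = vld1q_u8(buf);    //expands to LOAD_SI128(buf): aligned load when the pointer
//                                     //happens to be 16-byte aligned, _mm_loadu_si128 otherwise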
uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_u8 LOAD_SI128

uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_u16 LOAD_SI128

uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
#define vld1q_u32 LOAD_SI128

uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld1q_u64 LOAD_SI128

int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_s8 LOAD_SI128

int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_s16 LOAD_SI128

int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
#define vld1q_s32 LOAD_SI128

int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld1q_s64 LOAD_SI128

float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so we need to go to 32 bit floats and then work with two 128bit registers
/* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr) // VLD1.16 {d0, d1}, [r0]
{
    __m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
    __m128 f2;
    f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
}*/

float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
_NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
{
    if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16-byte aligned
        return _mm_load_ps(ptr);
    else
        return _mm_loadu_ps(ptr);
}

poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
#define vld1q_p8 LOAD_SI128

poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
#define vld1q_p16 LOAD_SI128

uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
#define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))

uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
#define vld1_u16 vld1_u8

uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
#define vld1_u32 vld1_u8

uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1_u64 vld1_u8

int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
#define vld1_s8 vld1_u8

int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
#define vld1_s16 vld1_u16

int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
#define vld1_s32 vld1_u32

int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1_s64 vld1_u64

float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so we need to go to 32 bit floats like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);

float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
{
    float32x2_t res;
    res.m64_f32[0] = *(ptr);
    res.m64_f32[1] = *(ptr + 1);
    return res;
}

poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
#define vld1_p8 vld1_u8

poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
#define vld1_p16 vld1_u16
//***********************************************************************************************************
//******* Lane load functions - insert the data at vector's given position (lane) *************************
//***********************************************************************************************************
uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)

uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)

uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
#define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)

uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
#define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p;

int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)

int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)

int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
#define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)

float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16

float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
{
    //we need to deal with the case of a ptr that is not 16-byte aligned
    __m128 p;
    p = _mm_set1_ps(*(ptr));
    return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane));
}

int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
#define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)

poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)

poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)

uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
_NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
{
    uint8x8_t res;
    res = vec;
    res.m64_u8[lane] = *(ptr);
    return res;
}

uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
_NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
{
    uint16x4_t res;
    res = vec;
    res.m64_u16[lane] = *(ptr);
    return res;
}

uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
{
    uint32x2_t res;
    res = vec;
    res.m64_u32[lane] = *(ptr);
    return res;
}

uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
_NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
{
    uint64x1_t res;
    res.m64_u64[0] = *(ptr);
    return res;
}

int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)

int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)

int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
#define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)

float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16

float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
{
    float32x2_t res;
    res = vec;
    res.m64_f32[lane] = *(ptr);
    return res;
}

int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
#define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)

poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
#define vld1_lane_p8 vld1_lane_u8

poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
#define vld1_lane_p16 vld1_lane_s16
9390 // ****************** Load single value ( set all lanes of vector with same value from memory)**********************
9391 // ******************************************************************************************************************
9392 uint8x16_t
vld1q_dup_u8(__transfersize(1) uint8_t const * ptr
); // VLD1.8 {d0[]}, [r0]
9393 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
9395 uint16x8_t
vld1q_dup_u16(__transfersize(1) uint16_t const * ptr
); // VLD1.16 {d0[]}, [r0]
9396 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
9398 uint32x4_t
vld1q_dup_u32(__transfersize(1) uint32_t const * ptr
); // VLD1.32 {d0[]}, [r0]
9399 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
9401 uint64x2_t
vld1q_dup_u64(__transfersize(1) uint64_t const * ptr
); // VLD1.64 {d0}, [r0]
9402 _NEON2SSE_INLINE uint64x2_t
vld1q_dup_u64(__transfersize(1) uint64_t const * ptr
)
9404 _NEON2SSE_ALIGN_16
uint64_t val
[2] = {*(ptr
), *(ptr
)};
9405 return LOAD_SI128(val
);
9408 int8x16_t
vld1q_dup_s8(__transfersize(1) int8_t const * ptr
); // VLD1.8 {d0[]}, [r0]
9409 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
9411 int16x8_t
vld1q_dup_s16(__transfersize(1) int16_t const * ptr
); // VLD1.16 {d0[]}, [r0]
9412 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
9414 int32x4_t
vld1q_dup_s32(__transfersize(1) int32_t const * ptr
); // VLD1.32 {d0[]}, [r0]
9415 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
9417 int64x2_t
vld1q_dup_s64(__transfersize(1) int64_t const * ptr
); // VLD1.64 {d0}, [r0]
9418 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
9420 float16x8_t
vld1q_dup_f16(__transfersize(1) __fp16
const * ptr
); // VLD1.16 {d0[]}, [r0]
9421 //current IA SIMD doesn't support float16, need to go to 32 bits
9423 float32x4_t
vld1q_dup_f32(__transfersize(1) float32_t
const * ptr
); // VLD1.32 {d0[]}, [r0]
9424 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
9426 poly8x16_t
vld1q_dup_p8(__transfersize(1) poly8_t
const * ptr
); // VLD1.8 {d0[]}, [r0]
9427 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
9429 poly16x8_t
vld1q_dup_p16(__transfersize(1) poly16_t
const * ptr
); // VLD1.16 {d0[]}, [r0]
9430 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint8x8_t res;
    int i;
    for(i = 0; i<8; i++) {
        res.m64_u8[i] = *(ptr);
    }
    return res;
}

uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint16x4_t res;
    int i;
    for(i = 0; i<4; i++) {
        res.m64_u16[i] = *(ptr);
    }
    return res;
}

uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint32x2_t res;
    res.m64_u32[0] = *(ptr);
    res.m64_u32[1] = *(ptr);
    return res;
}

uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
{
    uint64x1_t res;
    res.m64_u64[0] = *(ptr);
    return res;
}

int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)

int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)

int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)

int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)

float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
//current IA SIMD doesn't support float16

float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
_NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
{
    float32x2_t res;
    res.m64_f32[0] = *(ptr);
    res.m64_f32[1] = res.m64_f32[0];
    return res; // use last 64bits only
}

poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1_dup_p8 vld1_dup_u8

poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1_dup_p16 vld1_dup_u16
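//Note/usage sketch (illustrative, not part of the original header): the 64-bit _dup forms above
//are emulated serially, so when a full 128-bit register can be used the q-form is cheaper:
//  uint8_t b = 7;
//  uint8x16_t q = vld1q_dup_u8(&b);         // a single _mm_set1_epi8
//  uint8x8_t  d = vld1_dup_u8(&b);          // 8 scalar stores (see the performance warning above)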
//*************************************************************************************
//********************************* Store ********************************************
//*************************************************************************************
// If ptr is 16-byte aligned and you need to store data without cache pollution then use _mm_stream_si128 ((__m128i*)ptr, val);
// Here we assume a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128, as shown in the following macro.
#define STORE_SI128(ptr, val) \
        (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);

void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_u8 STORE_SI128

void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_u16 STORE_SI128

void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
#define vst1q_u32 STORE_SI128

void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
#define vst1q_u64 STORE_SI128

void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_s8 STORE_SI128

void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_s16 STORE_SI128

void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
#define vst1q_s32 STORE_SI128

void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
#define vst1q_s64 STORE_SI128

void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently

void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
{
    if( ((unsigned long)(ptr) & 15) == 0 ) //16-byte aligned
        _mm_store_ps (ptr, val);
    else
        _mm_storeu_ps (ptr, val);
}

void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
#define vst1q_p8 vst1q_u8

void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
#define vst1q_p16 vst1q_u16
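//Usage sketch (illustrative, not part of the original header): STORE_SI128 / vst1q_u8 picks the
//aligned or unaligned SSE store at run time, so both of these calls are legal:
//  _NEON2SSE_ALIGN_16 uint8_t dst_aligned[16];
//  uint8_t dst_unaligned[17];
//  vst1q_u8(dst_aligned, q);                 // takes the _mm_store_si128 path; q is assumed uint8x16_t
//  vst1q_u8(dst_unaligned + 1, q);           // takes the _mm_storeu_si128 path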
void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
_NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
{
    int i;
    for (i = 0; i<8; i++) {
        *(ptr + i) = ((uint8_t*)&val)[i];
    }
    //_mm_storel_epi64((__m128i*)ptr, val);
}

void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
_NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
{
    int i;
    for (i = 0; i<4; i++) {
        *(ptr + i) = ((uint16_t*)&val)[i];
    }
    //_mm_storel_epi64((__m128i*)ptr, val);
}

void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
{
    int i;
    for (i = 0; i<2; i++) {
        *(ptr + i) = ((uint32_t*)&val)[i];
    }
    //_mm_storel_epi64((__m128i*)ptr, val);
}

void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
_NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
{
    *(ptr) = *((uint64_t*)&val);
    //_mm_storel_epi64((__m128i*)ptr, val);
}

void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
#define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)

void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
#define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)

void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
#define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)

void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
#define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)

void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
//current IA SIMD doesn't support float16

void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
_NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
{
    *(ptr) = val.m64_f32[0];
    *(ptr + 1) = val.m64_f32[1];
}

void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
#define vst1_p8 vst1_u8

void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
#define vst1_p16 vst1_u16
//***********Store a lane of a vector into memory (extract given lane) *********************
//******************************************************************************************
void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)

void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)

void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
#define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)

void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
#define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)

void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)

void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)

void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
#define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)

void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
#define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)

void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16

void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
{
    int32_t ilane;
    ilane = _MM_EXTRACT_PS(val,lane);
    *(ptr) = *((float*)&ilane);
}

void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1q_lane_p8 vst1q_lane_u8

void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1q_lane_p16 vst1q_lane_s16

void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
{
    *(ptr) = val.m64_u8[lane];
}

void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
{
    *(ptr) = val.m64_u16[lane];
}

void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
{
    *(ptr) = val.m64_u32[lane];
}

void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
_NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
{
    *(ptr) = val.m64_u64[0];
}

void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)

void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)

void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
#define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane)

void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
#define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)

void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
//current IA SIMD doesn't support float16

void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
_NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
{
    *(ptr) = val.m64_f32[lane];
}

void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
#define vst1_lane_p8 vst1_lane_u8

void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
#define vst1_lane_p16 vst1_lane_s16
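//Usage sketch (illustrative, not part of the original header): write a single value back to
//memory, e.g. after a horizontal reduction, without storing the whole register.
//  float32_t out;
//  vst1q_lane_f32(&out, acc, 0);            // acc is assumed float32x4_t; stores acc[0] only
//  uint16_t first;
//  vst1q_lane_u16(&first, hdr, 0);          // hdr is assumed uint16x8_t; maps to _MM_EXTRACT_EPI16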
//***********************************************************************************************
//**************** Loads and stores of an N-element structure **********************************
//***********************************************************************************************
//These intrinsics load or store an n-element structure. The array structures are defined at the beginning of this file.
//We assume ptr is NOT aligned in the general case; for more details see the "Loads and stores of a single vector" functions above.
//****************** 2 elements load *********************************************
uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
_NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
{
    uint8x16x2_t v;
    v.val[0] = vld1q_u8(ptr);
    v.val[1] = vld1q_u8((ptr + 16));
    v = vuzpq_s8(v.val[0], v.val[1]);
    return v;
}

uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
_NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
{
    uint16x8x2_t v;
    v.val[0] = vld1q_u16( ptr);
    v.val[1] = vld1q_u16( (ptr + 8));
    v = vuzpq_s16(v.val[0], v.val[1]);
    return v;
}

uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
{
    uint32x4x2_t v;
    v.val[0] = vld1q_u32 ( ptr);
    v.val[1] = vld1q_u32 ( (ptr + 4));
    v = vuzpq_s32(v.val[0], v.val[1]);
    return v;
}

int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
#define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)

int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
#define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)

int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
#define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)

float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
_NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
{
    float32x4x2_t v;
    v.val[0] = vld1q_f32 (ptr);
    v.val[1] = vld1q_f32 ((ptr + 4));
    v = vuzpq_f32(v.val[0], v.val[1]);
    return v;
}

poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
#define vld2q_p8 vld2q_u8

poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
#define vld2q_p16 vld2q_u16

uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
_NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
{
    _NEON2SSE_ALIGN_16 uint8x8x2_t v;
    __m128i ld128;
    _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
    ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit
    ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
    vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
    return v;
}

uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
_NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
{
    _NEON2SSE_ALIGN_16 uint16x4x2_t v;
    __m128i ld128;
    _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
    ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit
    ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
    vst1q_u16((v.val), ld128);
    return v;
}

uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
{
    _NEON2SSE_ALIGN_16 uint32x2x2_t v;
    __m128i ld128;
    ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit
    ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
    vst1q_u32((v.val), ld128);
    return v;
}

uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
_NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
{
    uint64x1x2_t v;
    v.val[0].m64_u64[0] = *(ptr);
    v.val[1].m64_u64[0] = *(ptr + 1);
    return v;
}

int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
#define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)

int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
#define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)

int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
#define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)

int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)

float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example

float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
{
    float32x2x2_t v;
    v.val[0].m64_f32[0] = *(ptr);
    v.val[0].m64_f32[1] = *(ptr + 2);
    v.val[1].m64_f32[0] = *(ptr + 1);
    v.val[1].m64_f32[1] = *(ptr + 3);
    return v;
}

poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
#define vld2_p8 vld2_u8

poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
#define vld2_p16 vld2_u16
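//Usage sketch (illustrative, not part of the original header): de-interleave stereo 16-bit samples
//into separate left/right registers with a single structure load.
//  int16_t interleaved[16];                  // L0,R0,L1,R1,... assumed filled elsewhere
//  int16x8x2_t lr = vld2q_s16(interleaved);  // lr.val[0] = L0..L7, lr.val[1] = R0..R7
//  int16x8_t mono = vhaddq_s16(lr.val[0], lr.val[1]); // halving add to mix the two channels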
//******************** Triplets ***************************************
//*********************************************************************
uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
{
    //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
    //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13
    //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14,
    //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15
    uint8x16x3_t v;
    __m128i tmp0, tmp1,tmp2, tmp3;
    _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
    _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
    _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};

    v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
    v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
    v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15

    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
    tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15

    tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
    tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
    tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
    tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
    v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
    v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,

    tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
    tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
    v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
    v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
    v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
    v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
    tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
    tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,

    tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
    v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
    v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
    v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
    tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
    v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
    return v;
}
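//Usage sketch (illustrative, not part of the original header): split a packed RGB byte stream
//into per-channel registers; on x86 this expands to the shuffle/shift/or sequence above.
//  uint8_t rgb[48];                          // R0,G0,B0,R1,G1,B1,... assumed filled elsewhere
//  uint8x16x3_t ch = vld3q_u8(rgb);          // ch.val[0]=R0..R15, ch.val[1]=G0..G15, ch.val[2]=B0..B15
//  uint8x16_t gray = vrhaddq_u8(ch.val[0], ch.val[2]); // e.g. a cheap (R+B)/2 approximation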
uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
{
    //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
    uint16x8x3_t v;
    __m128i tmp0, tmp1,tmp2, tmp3;
    _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
    _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
    _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};

    v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
    v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
    v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7

    tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
    tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
    tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7

    tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
    tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
    tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
    tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
    v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
    v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5

    tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
    tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
    v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
    v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
    v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
    v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
    tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
    v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,

    tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
    tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
    v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
    v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
    v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
    tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
    v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
    return v;
}

uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
{
    //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
    uint32x4x3_t v;
    __m128i tmp0, tmp1,tmp2, tmp3;
    v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3,
    v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
    v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,

    tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
    tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
    tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3

    tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
    v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
    tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
    v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32); //a2,b1, a1,b0,
    v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
    v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
    return v;
}
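//Usage sketch (illustrative, not part of the original header): split packed 32-bit triples
//(e.g. x,y,z coordinates) into per-component registers using only unpacks and shuffles.
//  uint32_t xyz[12];                         // x0,y0,z0,x1,y1,z1,... assumed filled elsewhere
//  uint32x4x3_t p = vld3q_u32(xyz);          // p.val[0]=x0..x3, p.val[1]=y0..y3, p.val[2]=z0..z3
//  uint32x4_t sum = vaddq_u32(vaddq_u32(p.val[0], p.val[1]), p.val[2]);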
int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr);
#define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))

int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
#define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))

int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
#define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))

float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
{
    //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
    float32x4x3_t v;
    __m128 tmp0, tmp1,tmp2, tmp3;
    v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3,
    v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
    v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,

    tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
    tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
    tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
    tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2

    v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
    tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
    v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32)); //a2,b1, a1,b0,
    v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
    v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
    return v;
}

poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
#define vld3q_p8 vld3q_u8

poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
#define vld3q_p16 vld3q_u16

uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
{
    //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
    uint8x8x3_t v;
    __m128i val0, val1, val2, tmp0, tmp1;
    _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
    _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
    val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
    val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7

    tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
    tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
    val0 = _mm_slli_si128(tmp0,10);
    val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
    val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
    val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
    _M64(v.val[0], val0);
    val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
    val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
    val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
    val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
    val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
    _M64(v.val[1], val1);

    tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
    val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
    val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
    val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
    _M64(v.val[2], val2);
    return v;
}

uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
{
    //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3,
    uint16x4x3_t v;
    __m128i val0, val1, val2, tmp0, tmp1;
    _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
    val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3
    val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x

    tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
    tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3,
    val0 = _mm_slli_si128(tmp0,10);
    val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
    val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
    val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
    val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
    _M64(v.val[0], val0);

    val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
    val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
    val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
    val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
    val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
    _M64(v.val[1], val1);

    tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
    tmp1 = _mm_srli_si128(tmp1,4);
    tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
    val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
    _M64(v.val[2], val2);
    return v;
}

uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
{
    //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
    uint32x2x3_t v;
    __m128i val0, val1, val2;
    val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
    val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x

    val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
    _M64(v.val[0], val0);
    val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1,
    val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
    _M64(v.val[1], val1);
    val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x,
    _M64(v.val[2], val2);
    return v;
}

uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
{
    uint64x1x3_t v;
    v.val[0].m64_u64[0] = *(ptr);
    v.val[1].m64_u64[0] = *(ptr + 1);
    v.val[2].m64_u64[0] = *(ptr + 2);
    return v;
}

int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
#define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)

int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
#define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)

int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
#define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)

int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
#define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)

float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
{
    //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1
    float32x2x3_t v;
    v.val[0].m64_f32[0] = *(ptr);
    v.val[0].m64_f32[1] = *(ptr + 3);

    v.val[1].m64_f32[0] = *(ptr + 1);
    v.val[1].m64_f32[1] = *(ptr + 4);

    v.val[2].m64_f32[0] = *(ptr + 2);
    v.val[2].m64_f32[1] = *(ptr + 5);
    return v;
}

poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
#define vld3_p8 vld3_u8

poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
#define vld3_p16 vld3_u16
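//Usage sketch (illustrative, not part of the original header): the d-register form handles 8 RGB
//pixels at a time, which is convenient for image tails narrower than 16 pixels.
//  uint8_t rgb8[24];                         // 8 interleaved RGB pixels, assumed filled elsewhere
//  uint8x8x3_t c = vld3_u8(rgb8);            // c.val[0]=R0..R7, c.val[1]=G0..G7, c.val[2]=B0..B7
//  vst1_u8(dst, c.val[1]);                   // e.g. keep only the green plane; dst is assumed uint8_t[8]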
//*************** Quadruples load ********************************
//*****************************************************************
uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
{
    uint8x16x4_t v;
    __m128i tmp3, tmp2, tmp1, tmp0;

    v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
    v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
    v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
    v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15

    tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
    tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
    tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
    tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15

    v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11
    v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
    v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
    v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15

    tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); //a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
    tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
    tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
    tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15

    v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
    v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
    v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
    v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
    return v;
}

uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
{
    uint16x8x4_t v;
    __m128i tmp3, tmp2, tmp1, tmp0;
    tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7
    tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
    tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
    tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
    v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
    v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
    v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
    v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
    tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
    tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
    tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
    tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
    v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
    v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
    v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
    v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
    return v;
}

uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
{
    uint32x4x4_t v;
    __m128i tmp3, tmp2, tmp1, tmp0;
    v.val[0] = vld1q_u32 (ptr);
    v.val[1] = vld1q_u32 ((ptr + 4));
    v.val[2] = vld1q_u32 ((ptr + 8));
    v.val[3] = vld1q_u32 ((ptr + 12));
    tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
    tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
    tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
    tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
    v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
    v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
    v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
    v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
    return v;
}

int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
#define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)

int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
#define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)

int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
#define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)

float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
{
    float32x4x4_t v;
    __m128 tmp3, tmp2, tmp1, tmp0;

    v.val[0] = vld1q_f32 ((float*) ptr);
    v.val[1] = vld1q_f32 ((float*) (ptr + 4));
    v.val[2] = vld1q_f32 ((float*) (ptr + 8));
    v.val[3] = vld1q_f32 ((float*) (ptr + 12));
    tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
    tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
    tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
    tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
    v.val[0] = _mm_movelh_ps(tmp0, tmp2);
    v.val[1] = _mm_movehl_ps(tmp2, tmp0);
    v.val[2] = _mm_movelh_ps(tmp1, tmp3);
    v.val[3] = _mm_movehl_ps(tmp3, tmp1);
    return v;
}

poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
#define vld4q_p8 vld4q_u8

poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
#define vld4q_p16 vld4q_s16

uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
{
    uint8x8x4_t v;
    __m128i sh0, sh1;
    __m128i val0, val2;
    _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};

    val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1]
    val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3]

    sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
    sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
    val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
    vst1q_u8(&v.val[0], val0 );
    val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
    vst1q_u8(&v.val[2], val2 );
    return v;
}

uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
{
    uint16x4x4_t v;
    __m128i sh0, sh1;
    __m128i val0, val2;
    _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
    val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1]
    val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3]
    sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
    sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
    val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
    vst1q_u16(&v.val[0], val0 );
    val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
    vst1q_u16(&v.val[2], val2 );
    return v;
}

uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
{
    //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
    uint32x2x4_t v;
    __m128i val0, val01, val2;
    val0 = vld1q_u32 (ptr); //a0,a1, b0,b1,
    val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
    val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
    val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
    vst1q_u32(&v.val[0], val01);
    vst1q_u32(&v.val[2], val2 );
    return v;
}

uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
{
    uint64x1x4_t v;
    v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1]
    v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1]
    v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3]
    v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3]
    return v;
}

int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
#define vld4_s8(ptr) vld4_u8((uint8_t*)ptr)

int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
#define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)

int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
#define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)

int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
#define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)

float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
{
    //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
    float32x2x4_t res;
    res.val[0].m64_f32[0] = *(ptr);
    res.val[0].m64_f32[1] = *(ptr + 4);
    res.val[1].m64_f32[0] = *(ptr + 1);
    res.val[1].m64_f32[1] = *(ptr + 5);
    res.val[2].m64_f32[0] = *(ptr + 2);
    res.val[2].m64_f32[1] = *(ptr + 6);
    res.val[3].m64_f32[0] = *(ptr + 3);
    res.val[3].m64_f32[1] = *(ptr + 7);
    return res;
}

poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
#define vld4_p8 vld4_u8

poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
#define vld4_p16 vld4_u16
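//Usage sketch (illustrative, not part of the original header): de-interleave RGBA pixels so each
//channel can be processed as a separate register, e.g. to swap channels or premultiply alpha.
//  uint8_t rgba[64];                         // 16 interleaved RGBA pixels, assumed filled elsewhere
//  uint8x16x4_t px = vld4q_u8(rgba);         // px.val[0]=R, val[1]=G, val[2]=B, val[3]=A
//  uint8x16_t tmp = px.val[0];               // swap R and B to convert RGBA -> BGRA
//  px.val[0] = px.val[2];
//  px.val[2] = tmp;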
//************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
//*******************************************************************************************************************
uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
_NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
{
    uint8x8x2_t v;
    __m128i val0, val1;
    val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
    vst1q_u8(v.val, val0);
    return v;
}

uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
_NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
{
    uint16x4x2_t v;
    __m128i val0, val1;
    val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
    val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
    _M64(v.val[0], val0);
    val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
    _M64(v.val[1], val1);
    return v;
}

uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
{
    uint32x2x2_t v;
    __m128i val0;
    val0 = LOAD_SI128(ptr); //0,1,x,x
    val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
    vst1q_u32(v.val, val0);
    return v;
}

uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_dup_u64 vld2_u64

int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
#define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)

int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
#define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)

int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
#define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)

int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
#define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)

float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
_NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
{
    float32x2x2_t v;
    v.val[0].m64_f32[0] = *(ptr); //0,0
    v.val[0].m64_f32[1] = *(ptr); //0,0
    v.val[1].m64_f32[0] = *(ptr + 1); //1,1
    v.val[1].m64_f32[1] = *(ptr + 1); //1,1
    return v;
}

poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
#define vld2_dup_p8 vld2_dup_u8

poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
#define vld2_dup_p16 vld2_dup_s16
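//Usage sketch (illustrative, not part of the original header): broadcast an interleaved
//coefficient pair (e.g. scale, offset) so each value fills its own d-register.
//  const float32_t so[2] = {2.0f, 0.5f};
//  float32x2x2_t k = vld2_dup_f32(so);       // k.val[0]={2.0f,2.0f}, k.val[1]={0.5f,0.5f}
//  float32x2_t y = vmla_f32(k.val[1], x, k.val[0]); // x is assumed float32x2_t: y = offset + x*scale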
//************* Duplicate (or propagate) triplets: *******************
//********************************************************************
//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
{
    uint8x8x3_t v;
    __m128i val0, val1, val2;
    val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
    val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
    vst1q_u8(v.val, val0);
    _M64(v.val[2], val2);
    return v;
}

uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
{
    uint16x4x3_t v;
    __m128i val0, val1, val2;
    val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
    val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
    val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
    val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
    _M64(v.val[0], val0);
    _M64(v.val[1], val1);
    _M64(v.val[2], val2);
    return v;
}

uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
{
    uint32x2x3_t v;
    __m128i val0, val1, val2;
    val2 = LOAD_SI128(ptr); //0,1,2,x
    val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
    val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
    val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
    _M64(v.val[0], val0);
    _M64(v.val[1], val1);
    _M64(v.val[2], val2);
    return v;
}

uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
{
    uint64x1x3_t v;
    v.val[0].m64_u64[0] = *(ptr);
    v.val[1].m64_u64[0] = *(ptr + 1);
    v.val[2].m64_u64[0] = *(ptr + 2);
    return v;
}

int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)

int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)

int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)

int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
#define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)

float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
_NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
{
    float32x2x3_t v;
    int i;
    for (i = 0; i<3; i++) {
        v.val[i].m64_f32[0] = *(ptr + i);
        v.val[i].m64_f32[1] = *(ptr + i);
    }
    return v;
}

poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_p8 vld3_dup_u8

poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
#define vld3_dup_p16 vld3_dup_s16
//************* Duplicate (or propagate) quadruples: *******************
//***********************************************************************
//ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes
uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
{
    uint8x8x4_t v;
    __m128i val0, val1, val2;
    val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
    val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
    val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
    val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
    val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
    vst1q_u8(&v.val[0], val0);
    vst1q_u8(&v.val[2], val2);
    return v;
}

uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
{
    uint16x4x4_t v;
    __m128i val0, val1, val2, val3;
    val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
    val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
    val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
    val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
    val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
    _M64(v.val[0], val0);
    _M64(v.val[1], val1);
    _M64(v.val[2], val2);
    _M64(v.val[3], val3);
    return v;
}

uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
{
    uint32x2x4_t v;
    __m128i val0, val1, val2, val3;
    val3 = LOAD_SI128(ptr); //0,1,2,3
    val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
    val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
    val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
    val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
    _M64(v.val[0], val0);
    _M64(v.val[1], val1);
    _M64(v.val[2], val2);
    _M64(v.val[3], val3);
    return v;
}

uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
{
    uint64x1x4_t v;
    v.val[0].m64_u64[0] = *(ptr);
    v.val[1].m64_u64[0] = *(ptr + 1);
    v.val[2].m64_u64[0] = *(ptr + 2);
    v.val[3].m64_u64[0] = *(ptr + 3);
    return v;
}

int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)

int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)

int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)

int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
#define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)

float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
{
    float32x2x4_t v;
    int i;
    for (i = 0; i<4; i++) {
        v.val[i].m64_f32[0] = *(ptr + i);
        v.val[i].m64_f32[1] = *(ptr + i);
    }
    return v;
}

poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_p8 vld4_dup_u8

poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_p16 vld4_dup_u16
//**********************************************************************************
//******************* Lane loads for N-element structures *************************
//**********************************************************************************
//********************** Lane pairs ************************************************
//does vld1_lane_xx: ptr[0] goes to src->val[0] at the lane position and ptr[1] to src->val[1] at the lane position
//we assume src is 16-byte aligned

//!!!!!! Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error
//to fix it, all the functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined as macros
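//Illustrative usage sketch (not part of the original header, kept as a comment): the NEON-style call
//below compiles unchanged because the vld2q_lane_f32 macro forwards &src to vld2q_lane_f32_ptr,
//working around the MSVC restriction described above. The buffer names are hypothetical.
//    float32x4x2_t src = vld2q_f32(buf);  //buf points to at least 8 floats
//    src = vld2q_lane_f32(pair, src, 2);  //pair[0] -> lane 2 of src.val[0], pair[1] -> lane 2 of src.val[1]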
//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
{
    uint16x8x2_t v;
    v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
    v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)

//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
{
    uint32x4x2_t v;
    v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
    return v;
}
#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)

//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
{
    int16x8x2_t v;
    v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
    v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)

//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
{
    int32x4x2_t v;
    v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
    return v;
}
#define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)

//float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
//current IA SIMD doesn't support float16

//float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
{
    float32x4x2_t v;
    v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
    v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)

//poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
#define vld2q_lane_p16 vld2q_lane_u16
//uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
{
    uint8x8x2_t v;
    v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)

//uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane)
{
    uint16x4x2_t v;
    v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane)

//uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane)
{
    uint32x2x2_t v;
    v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane)
//int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
#define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane)

//int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
#define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)

//int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
#define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)

//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
//current IA SIMD doesn't support float16

float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane)
{
    float32x2x2_t v;
    v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane)

//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
#define vld2_lane_p8 vld2_lane_u8

//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
#define vld2_lane_p16 vld2_lane_u16
//*********** Lane triplets **********************
//*************************************************
//does vld1_lane_xx: ptr[0] goes to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at the lane position
//we assume src is 16-byte aligned
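//Illustrative usage sketch (not part of the original header, kept as a comment): inserting one more
//interleaved RGB pixel into lane 5 of three already-deinterleaved planes. Names are hypothetical.
//    uint8x8x3_t planes = vld3_u8(rgb_block);      //deinterleave 8 RGB pixels into R,G,B planes
//    planes = vld3_lane_u8(rgb_pixel, planes, 5);  //rgb_pixel[0..2] -> lane 5 of R,G,B respectively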
//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
{
    uint16x8x3_t v;
    v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
    return v;
}
#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)

//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
    uint32x4x3_t v;
    v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
    return v;
}
#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)

//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
{
    int16x8x3_t v;
    v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
    return v;
}
#define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)

//int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
    int32x4x3_t v;
    v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
    return v;
}
#define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)

float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
//current IA SIMD doesn't support float16
#define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)

//float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
    float32x4x3_t v;
    v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
    v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
    v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
    return v;
}
#define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)

poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
#define vld3q_lane_p16 vld3q_lane_u16
//uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
{
    uint8x8x3_t v;
    v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
    v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
    return v;
}
#define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)

//uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
{
    uint16x4x3_t v;
    v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
    v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
    return v;
}
#define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)

//uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
{
    //need to merge into 128 bit anyway
    uint32x2x3_t v;
    v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
    v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
    return v;
}
#define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)

int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)

int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)

int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)

float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
//current IA SIMD doesn't support float16

//float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
{
    float32x2x3_t v;
    v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
    v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
    return v;
}
#define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane)

//poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_p8 vld3_lane_u8

//poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_p16 vld3_lane_u16
//******************* Lane Quadruples load ***************************
//*********************************************************************
//does vld1_lane_xx: ptr[0] goes to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at the lane position
//we assume src is 16-byte aligned
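//Illustrative usage sketch (not part of the original header, kept as a comment): patching lane 0 of
//four deinterleaved RGBA planes from one packed 4-byte pixel. Names are hypothetical.
//    uint8x8x4_t rgba = vld4_u8(rgba_block);   //deinterleave 8 RGBA pixels into 4 planes
//    rgba = vld4_lane_u8(one_pixel, rgba, 0);  //one_pixel[0..3] -> lane 0 of the R,G,B,A planes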
//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
{
    uint16x8x4_t v;
    v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
    v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
    return v;
}
#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)

//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
{
    uint32x4x4_t v;
    v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
    v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane);
    return v;
}
#define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)

//int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)

//int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3) int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane)

//float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
//current IA SIMD doesn't support float16

//float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
{
    float32x4x4_t v;
    v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
    v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
    v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
    v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
    return v;
}
#define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)

//poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
#define vld4q_lane_p16 vld4q_lane_u16
//uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
{
    uint8x8x4_t v;
    v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
    v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
    v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane);
    return v;
}
#define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)

//uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
{
    uint16x4x4_t v;
    v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
    v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
    v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane);
    return v;
}
#define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)

//uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
{
    uint32x2x4_t v;
    v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
    v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
    v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane);
    return v;
}
#define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)

//int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);
#define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)

//int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);
#define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)

//int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);
#define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)

//float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
//current IA SIMD doesn't support float16

//float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
{
    //serial solution may be faster
    float32x2x4_t v;
    v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
    v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
    v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
    v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane);
    return v;
}
#define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane)

//poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
#define vld4_lane_p8 vld4_lane_u8

//poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);
#define vld4_lane_p16 vld4_lane_u16
//******************* Store duplets *********************************************
//********************************************************************************
//here we assume the ptr is 16-byte aligned. If not, we need to use _mm_storeu_si128 as shown in the vst1q_u8 function
//If necessary you need to modify all store functions accordingly. See more comments to "Store single" functions
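//Illustrative usage sketch (not part of the original header, kept as a comment): interleaving two
//16-byte planes back into a 32-byte two-channel buffer. The buffer names are hypothetical.
//    uint8x16x2_t planes;
//    planes.val[0] = vld1q_u8(plane0);
//    planes.val[1] = vld1q_u8(plane1);
//    vst2q_u8(interleaved, planes);  //writes p0[0],p1[0],p0[1],p1[1],... (32 bytes)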
//void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
_NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
{
    uint8x16x2_t v;
    v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
    v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
    vst1q_u8 (ptr, v.val[0]);
    vst1q_u8 ((ptr + 16), v.val[1]);
}
#define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)

//void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
_NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
{
    uint16x8x2_t v;
    v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
    v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
    vst1q_u16 (ptr, v.val[0]);
    vst1q_u16 ((ptr + 8), v.val[1]);
}
#define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)

//void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
_NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
{
    uint32x4x2_t v;
    v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
    v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
    vst1q_u32 (ptr, v.val[0]);
    vst1q_u32 ((ptr + 4), v.val[1]);
}
#define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)

//void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
#define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)

//void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
#define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)

//void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
#define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val)

//void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
// IA32 SIMD doesn't work with 16bit floats currently

//void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
_NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
{
    float32x4x2_t v;
    v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
    v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
    vst1q_f32 (ptr, v.val[0]);
    vst1q_f32 ((ptr + 4), v.val[1]);
}
#define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)

//void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
#define vst2q_p8 vst2q_u8

//void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
#define vst2q_p16 vst2q_u16
//void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
{
    __m128i v0;
    v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1]));
    vst1q_u8 (ptr, v0);
}
#define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)

//void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
{
    __m128i v0;
    v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1]));
    vst1q_u16 (ptr, v0);
}
#define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)

//void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
{
    __m128i v0;
    v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1]));
    vst1q_u32 (ptr, v0);
}
#define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)

//void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
_NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
{
    *(ptr) = val->val[0].m64_u64[0];
    *(ptr + 1) = val->val[1].m64_u64[0];
}
#define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)

//void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
#define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)

//void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
#define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)

//void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
#define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)

//void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
#define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)

//void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
//current IA SIMD doesn't support float16

//void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
_NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
{
    *(ptr) = val->val[0].m64_f32[0];
    *(ptr + 1) = val->val[1].m64_f32[0];
    *(ptr + 2) = val->val[0].m64_f32[1];
    *(ptr + 3) = val->val[1].m64_f32[1];
}
#define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)

//void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
#define vst2_p8 vst2_u8

//void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
#define vst2_p16 vst2_u16
//******************** Triplets store *****************************************
//******************************************************************************
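//Illustrative usage sketch (not part of the original header, kept as a comment): writing three
//separate R,G,B planes back out as packed RGB triplets. The buffer names are hypothetical.
//    uint8x16x3_t rgb;
//    rgb.val[0] = vld1q_u8(r_plane);
//    rgb.val[1] = vld1q_u8(g_plane);
//    rgb.val[2] = vld1q_u8(b_plane);
//    vst3q_u8(packed_rgb, rgb);  //writes 48 bytes: R0,G0,B0,R1,G1,B1,...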
//void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
{
    uint8x16x3_t v;
    __m128i v0,v1,v2, cff, bldmask;
    _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
    _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
    _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
    _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
    _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
    _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};

    v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
    v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
    v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34
    v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
    v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
    cff = _mm_cmpeq_epi8(v0, v0); //all ff
    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
    v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
    vst1q_u8(ptr, v.val[0]);
    v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
    v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
    v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
    vst1q_u8((ptr + 16), v.val[1]);
    v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
    v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
    bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
    v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
    vst1q_u8((ptr + 32), v.val[2]);
}
#define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
//void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
{
    uint16x8x3_t v;
    __m128i v0,v1,v2, cff, bldmask;
    _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
    _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
    _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
    _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
    _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
    _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};

    v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
    v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
    v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
    v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
    v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
    cff = _mm_cmpeq_epi16(v0, v0); //all ff
    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
    v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
    vst1q_u16(ptr, v.val[0]);
    v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
    v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
    v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
    vst1q_u16((ptr + 8), v.val[1]);
    v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
    v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
    bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
    v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
    vst1q_u16((ptr + 16), v.val[2]);
}
#define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
//void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
{
    //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
    uint32x4x3_t v;
    __m128i tmp0, tmp1,tmp2;
    tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
    tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
    v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
    v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
    v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
    tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
    v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,

    vst1q_u32(ptr, v.val[0]);
    vst1q_u32((ptr + 4), v.val[1]);
    vst1q_u32((ptr + 8), v.val[2]);
}
#define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)

//void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
#define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)

//void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
#define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)

//void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
#define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val)

//void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
// IA32 SIMD doesn't work with 16bit floats currently

//void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
_NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
{
    float32x4x3_t v;
    __m128 tmp0, tmp1,tmp2;
    tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
    tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
    tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
    v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
    v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
    v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
    tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
    v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,

    vst1q_f32( ptr, v.val[0]);
    vst1q_f32( (ptr + 4), v.val[1]);
    vst1q_f32( (ptr + 8), v.val[2]);
}
#define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)

//void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
#define vst3q_p8 vst3q_u8

//void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
#define vst3q_p16 vst3q_u16
//void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
{
    __m128i tmp, sh0, sh1, val0, val2;
    _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
    _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
    _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
    _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
    tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) );
    sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15)
    val2 = _pM128i(val->val[2]);
    sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
    val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
    vst1q_u8(ptr, val0); //store as 128 bit structure
    sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15)
    sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
    val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
    _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory
}
#define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)

//void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
{
    __m128i tmp, val0, val1, val2;
    _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
    _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0};
    _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1]
    _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0]
    tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]));
    val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
    val2 = _pM128i(val->val[2]);
    val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
    val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
    vst1q_u16(ptr, val0); //store as 128 bit structure
    val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
    val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
    val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order
    _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory
}
#define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
//void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
{
    //val->val[0]:0,3, val->val[1]:1,4; val->val[2]:2,5,x,x;
    __m128i val0, val1;
    val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5
    val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
    val1 = _mm_srli_si128(val0, 8); //4,5, x,x
    _M64((*(__m64_128*)(ptr + 4)), val1);
    val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2
    val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2,3
    vst1q_u32(ptr, val0); //store as 128 bit structure
}
#define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)

//void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
{
    *(ptr) = val->val[0].m64_u64[0];
    *(ptr + 1) = val->val[1].m64_u64[0];
    *(ptr + 2) = val->val[2].m64_u64[0];
}
#define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)

//void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0]
#define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)

//void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0]
#define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)

//void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
#define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)

//void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0]
#define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)

//void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
// IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example

//void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
_NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
{
    //val->val[0]:0,3, val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5
    *(ptr) = val->val[0].m64_f32[0];
    *(ptr + 1) = val->val[1].m64_f32[0];
    *(ptr + 2) = val->val[2].m64_f32[0];
    *(ptr + 3) = val->val[0].m64_f32[1];
    *(ptr + 4) = val->val[1].m64_f32[1];
    *(ptr + 5) = val->val[2].m64_f32[1];
}
#define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)

//void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
#define vst3_p8 vst3_u8

//void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
#define vst3_p16 vst3_s16
//*************** Quadruples store ********************************
//*********************************************************************
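//Illustrative usage sketch (not part of the original header, kept as a comment): re-interleaving four
//R,G,B,A planes into packed RGBA pixels. The buffer names are hypothetical.
//    uint8x8x4_t rgba;
//    rgba.val[0] = vld1_u8(r8); rgba.val[1] = vld1_u8(g8);
//    rgba.val[2] = vld1_u8(b8); rgba.val[3] = vld1_u8(a8);
//    vst4_u8(packed_rgba, rgba);  //writes 32 bytes: R0,G0,B0,A0,R1,G1,B1,A1,...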
//void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
{
    __m128i tmp1, tmp2, res;
    tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
    tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
    res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
    vst1q_u8(ptr, res);
    res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
    vst1q_u8((ptr + 16), res);
    tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
    tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
    res = _mm_unpacklo_epi16(tmp1, tmp2); //
    vst1q_u8((ptr + 32), res);
    res = _mm_unpackhi_epi16(tmp1, tmp2); //
    vst1q_u8((ptr + 48), res);
}
#define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)

//void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
{
    uint16x8x4_t v;
    __m128i tmp1, tmp2;
    tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
    tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7, 10,11, 14,15
    v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
    v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
    tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
    tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7, 10,11, 14,15
    v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
    v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
    vst1q_u16(ptr, v.val[0]);
    vst1q_u16((ptr + 8), v.val[1]);
    vst1q_u16((ptr + 16),v.val[2]);
    vst1q_u16((ptr + 24), v.val[3]);
}
#define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)

//void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
{
    uint32x4x4_t v;
    __m128i tmp1, tmp2;
    tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
    tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7, 10,11, 14,15
    v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
    v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
    tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
    tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7, 10,11, 14,15
    v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
    v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
    vst1q_u32(ptr, v.val[0]);
    vst1q_u32((ptr + 4), v.val[1]);
    vst1q_u32((ptr + 8), v.val[2]);
    vst1q_u32((ptr + 12), v.val[3]);
}
#define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
//void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
#define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)

//void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
#define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)

//void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
#define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)

//void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
// IA32 SIMD doesn't work with 16bit floats currently

//void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
_NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
{
    float32x4x4_t v;
    __m128 tmp3, tmp2, tmp1, tmp0;

    tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
    tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
    tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
    tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
    v.val[0] = _mm_movelh_ps(tmp0, tmp2);
    v.val[1] = _mm_movehl_ps(tmp2, tmp0);
    v.val[2] = _mm_movelh_ps(tmp1, tmp3);
    v.val[3] = _mm_movehl_ps(tmp3, tmp1);
    vst1q_f32(ptr, v.val[0]);
    vst1q_f32((ptr + 4), v.val[1]);
    vst1q_f32((ptr + 8), v.val[2]);
    vst1q_f32((ptr + 12), v.val[3]);
}
#define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)

//void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
#define vst4q_p8 vst4q_u8

//void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
#define vst4q_p16 vst4q_s16
//void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
{
    __m128i sh0, sh1, val0, val2;
    sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
    sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
    val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
    val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
    vst1q_u8(ptr, val0);
    vst1q_u8((ptr + 16), val2);
}
#define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)

//void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
{
    __m128i sh0, sh1, val0, val2;
    sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
    sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
    val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
    val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
    vst1q_u16(ptr, val0); //store as 128 bit structure
    vst1q_u16((ptr + 8), val2);
}
#define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)

//void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
{
    //0,4, 1,5, 2,6, 3,7
    __m128i sh0, sh1, val0, val1;
    sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5
    sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7
    val0 = _mm_unpacklo_epi64(sh0,sh1); //
    val1 = _mm_unpackhi_epi64(sh0,sh1); //
    vst1q_u32(ptr, val0); //store as 128 bit structure
    vst1q_u32((ptr + 4), val1);
}
#define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)

//void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
_NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
{
    *(ptr) = val->val[0].m64_u64[0];
    *(ptr + 1) = val->val[1].m64_u64[0];
    *(ptr + 2) = val->val[2].m64_u64[0];
    *(ptr + 3) = val->val[3].m64_u64[0];
}
#define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
11562 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0]
11563 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
11565 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0]
11566 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
11568 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
11569 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
11571 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
11572 void vst4_s64_ptr(__transfersize(4) int64_t * ptr
, int64x1x4_t
* val
);
11573 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
11575 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11576 void vst4_f16_ptr(__transfersize(16) __fp16
* ptr
, float16x4x4_t
* val
);
11577 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
11579 //void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
11580 _NEON2SSE_INLINE
void vst4_f32_ptr(__transfersize(8) float32_t
* ptr
, float32x2x4_t
* val
)
11582 //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7
11583 *(ptr
) = val
->val
[0].m64_f32
[0];
11584 *(ptr
+ 1) = val
->val
[1].m64_f32
[0];
11585 *(ptr
+ 2) = val
->val
[2].m64_f32
[0];
11586 *(ptr
+ 3) = val
->val
[3].m64_f32
[0];
11587 *(ptr
+ 4) = val
->val
[0].m64_f32
[1];
11588 *(ptr
+ 5) = val
->val
[1].m64_f32
[1];
11589 *(ptr
+ 6) = val
->val
[2].m64_f32
[1];
11590 *(ptr
+ 7) = val
->val
[3].m64_f32
[1];
11592 #define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
11594 //void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
11595 void vst4_p8_ptr(__transfersize(32) poly8_t
* ptr
, poly8x8x4_t
* val
);
11596 #define vst4_p8 vst4_u8
11598 //void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
11599 void vst4_p16_ptr(__transfersize(16) poly16_t
* ptr
, poly16x4x4_t
* val
);
11600 #define vst4_p16 vst4_u16
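
//The sketch below is an illustrative usage example added for clarity; it is not part of the original
//intrinsic mapping, it is compiled out, and the example_* name is hypothetical.
#if 0
//Interleave four 8-byte planes (e.g. R,G,B,A) into one 32-byte interleaved stream with vst4_u8.
void example_store_rgba8(uint8_t * dst, uint8x8_t r, uint8x8_t g, uint8x8_t b, uint8x8_t a)
{
    uint8x8x4_t rgba;
    rgba.val[0] = r;
    rgba.val[1] = g;
    rgba.val[2] = b;
    rgba.val[3] = a;
    vst4_u8(dst, rgba); //writes r0,g0,b0,a0, r1,g1,b1,a1, ... up to r7,g7,b7,a7
}
#endif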
//*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors *********************
//********************************************************************************************************************
//void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
{
    vst1q_lane_s16(ptr, val->val[0], lane);
    vst1q_lane_s16((ptr + 1), val->val[1], lane);
}
#define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)

//void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
{
    vst1q_lane_u32(ptr, val->val[0], lane);
    vst1q_lane_u32((ptr + 1), val->val[1], lane);
}
#define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)

//void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
#define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)

//void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
#define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane)

//void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
//current IA SIMD doesn't support float16

//void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t* val, __constrange(0,3) int lane)
{
    vst1q_lane_f32(ptr, val->val[0], lane);
    vst1q_lane_f32((ptr + 1), val->val[1], lane);
}
#define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)

//void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
#define vst2q_lane_p16 vst2q_lane_s16

//void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
{
    *(ptr) = val->val[0].m64_u8[lane];
    *(ptr + 1) = val->val[1].m64_u8[lane];
}
#define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane)

//void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane)
{
    *(ptr) = val->val[0].m64_u16[lane];
    *(ptr + 1) = val->val[1].m64_u16[lane];
}
#define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane)

//void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane)
{
    *(ptr) = val->val[0].m64_u32[lane];
    *(ptr + 1) = val->val[1].m64_u32[lane];
}
#define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane)

//void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
#define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane)

//void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
#define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane)

//void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
#define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane)

//void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
//current IA SIMD doesn't support float16

void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane)
{
    *(ptr) = val->val[0].m64_f32[lane];
    *(ptr + 1) = val->val[1].m64_f32[lane];
}
#define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane)

//void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
#define vst2_lane_p8 vst2_lane_u8

//void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
#define vst2_lane_p16 vst2_lane_u16
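
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//vst2q_lane_f32 stores the same lane of two 128-bit registers to adjacent memory cells,
//e.g. one (re, im) pair when val[0] holds real parts and val[1] holds imaginary parts.
#if 0
void example_store_complex_lane2(float32_t * dst, float32x4x2_t re_im)
{
    vst2q_lane_f32(dst, re_im, 2); //dst[0] = lane 2 of re_im.val[0], dst[1] = lane 2 of re_im.val[1]
}
#endif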
//************************* Triple lanes stores *******************************************************
//*******************************************************************************************************
//void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
{
    vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
    vst1q_lane_u16((ptr + 2), val->val[2], lane);
}
#define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)

//void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
{
    vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
    vst1q_lane_u32((ptr + 2), val->val[2], lane);
}
#define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)

//void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
#define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)

//void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
#define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)

//void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
//current IA SIMD doesn't support float16

//void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
{
    vst1q_lane_f32(ptr, val->val[0], lane);
    vst1q_lane_f32((ptr + 1), val->val[1], lane);
    vst1q_lane_f32((ptr + 2), val->val[2], lane);
}
#define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)

//void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
#define vst3q_lane_p16 vst3q_lane_s16

//void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane)
{
    *(ptr) = val->val[0].m64_u8[lane];
    *(ptr + 1) = val->val[1].m64_u8[lane];
    *(ptr + 2) = val->val[2].m64_u8[lane];
}
#define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane)

//void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane)
{
    *(ptr) = val->val[0].m64_u16[lane];
    *(ptr + 1) = val->val[1].m64_u16[lane];
    *(ptr + 2) = val->val[2].m64_u16[lane];
}
#define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane)

//void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane)
{
    *(ptr) = val->val[0].m64_u32[lane];
    *(ptr + 1) = val->val[1].m64_u32[lane];
    *(ptr + 2) = val->val[2].m64_u32[lane];
}
#define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane)

//void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
#define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)

//void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);
#define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)

//void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);
#define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)

//void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
//current IA SIMD doesn't support float16

//void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
_NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane)
{
    *(ptr) = val->val[0].m64_f32[lane];
    *(ptr + 1) = val->val[1].m64_f32[lane];
    *(ptr + 2) = val->val[2].m64_f32[lane];
}
#define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane)

//void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
#define vst3_lane_p8 vst3_lane_u8

//void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);
#define vst3_lane_p16 vst3_lane_s16
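
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//a classic vst3_lane use is writing a single interleaved RGB pixel from three planar registers.
#if 0
void example_store_one_rgb_pixel(uint8_t * dst, uint8x8x3_t rgb)
{
    vst3_lane_u8(dst, rgb, 0); //dst[0] = R0, dst[1] = G0, dst[2] = B0
}
#endif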
//******************************** Quadruple lanes stores ***********************************************
//*******************************************************************************************************
//void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
{
    vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane);
    vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
}
#define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)

//void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
{
    vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane);
    vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
}
#define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)

//void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
#define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)

//void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
#define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)

//void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
//current IA SIMD doesn't support float16

//void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
{
    vst1q_lane_f32(ptr, val->val[0], lane);
    vst1q_lane_f32((ptr + 1), val->val[1], lane);
    vst1q_lane_f32((ptr + 2), val->val[2], lane);
    vst1q_lane_f32((ptr + 3), val->val[3], lane);
}
#define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)

//void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
#define vst4q_lane_p16 vst4q_lane_u16

//void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
{
    *(ptr) = val->val[0].m64_u8[lane];
    *(ptr + 1) = val->val[1].m64_u8[lane];
    *(ptr + 2) = val->val[2].m64_u8[lane];
    *(ptr + 3) = val->val[3].m64_u8[lane];
}
#define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)

//void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
{
    *(ptr) = val->val[0].m64_u16[lane];
    *(ptr + 1) = val->val[1].m64_u16[lane];
    *(ptr + 2) = val->val[2].m64_u16[lane];
    *(ptr + 3) = val->val[3].m64_u16[lane];
}
#define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)

//void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
{
    *(ptr) = val->val[0].m64_u32[lane];
    *(ptr + 1) = val->val[1].m64_u32[lane];
    *(ptr + 2) = val->val[2].m64_u32[lane];
    *(ptr + 3) = val->val[3].m64_u32[lane];
}
#define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)

//void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
#define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)

//void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
#define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)

//void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
#define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)

//void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
//current IA SIMD doesn't support float16

void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
_NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane)
{
    *(ptr) = val->val[0].m64_f32[lane];
    *(ptr + 1) = val->val[1].m64_f32[lane];
    *(ptr + 2) = val->val[2].m64_f32[lane];
    *(ptr + 3) = val->val[3].m64_f32[lane];
}
#define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane)

//void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
#define vst4_lane_p8 vst4_lane_u8

//void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);
#define vst4_lane_p16 vst4_lane_u16
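
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//vst4q_lane_f32 can be read as "store one column of a 4x4 float matrix kept as four row vectors".
#if 0
void example_store_matrix_column2(float32_t * dst, float32x4x4_t rows)
{
    vst4q_lane_f32(dst, rows, 2); //dst[i] = lane 2 of rows.val[i], i = 0..3
}
#endif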
//**************************************************************************************************
//************************ Extract lanes from a vector ********************************************
//**************************************************************************************************
//These intrinsics extract a single lane (element) from a vector.
uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
#define vget_lane_u8(vec, lane) vec.m64_u8[lane]

uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
#define vget_lane_u16(vec, lane) vec.m64_u16[lane]

uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
#define vget_lane_u32(vec, lane) vec.m64_u32[lane]

int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
#define vget_lane_s8(vec, lane) vec.m64_i8[lane]

int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
#define vget_lane_s16(vec, lane) vec.m64_i16[lane]

int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
#define vget_lane_s32(vec, lane) vec.m64_i32[lane]

poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
#define vget_lane_p8 vget_lane_u8

poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
#define vget_lane_p16 vget_lane_u16

float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
#define vget_lane_f32(vec, lane) vec.m64_f32[lane]

uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
#define vgetq_lane_u8 _MM_EXTRACT_EPI8

uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
#define vgetq_lane_u16 _MM_EXTRACT_EPI16

uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
#define vgetq_lane_u32 _MM_EXTRACT_EPI32

int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
#define vgetq_lane_s8 vgetq_lane_u8

int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
#define vgetq_lane_s16 vgetq_lane_u16

int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
#define vgetq_lane_s32 vgetq_lane_u32

poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
#define vgetq_lane_p8 vgetq_lane_u8

poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
#define vgetq_lane_p16 vgetq_lane_u16

float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
_NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
{
    int32_t ilane;
    ilane = _MM_EXTRACT_PS(vec,lane);
    return *(float*)&ilane;
}

int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
#define vget_lane_s64(vec, lane) vec.m64_i64[0]

uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
#define vget_lane_u64(vec, lane) vec.m64_u64[0]

int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
#define vgetq_lane_s64 (int64_t) vgetq_lane_u64

uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
#define vgetq_lane_u64 _MM_EXTRACT_EPI64
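
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//the lane argument must be a compile time constant, as required by the _MM_EXTRACT_* macros underneath.
#if 0
float32_t example_sum_first_and_last(float32x4_t v)
{
    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 3);
}
#endif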
// ***************** Set lanes within a vector ********************************************
// **************************************************************************************
//These intrinsics set a single lane (element) within a vector.
//same functions as vld1_lane_xx ones, but take the value to be set directly.

uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
{
    uint8_t val;
    val = value;
    return vld1_lane_u8(&val, vec, lane);
}

uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
{
    uint16_t val;
    val = value;
    return vld1_lane_u16(&val, vec, lane);
}

uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
{
    uint32_t val;
    val = value;
    return vld1_lane_u32(&val, vec, lane);
}

int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
{
    int8_t val;
    val = value;
    return vld1_lane_s8(&val, vec, lane);
}

int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
_NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
{
    int16_t val;
    val = value;
    return vld1_lane_s16(&val, vec, lane);
}

int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
{
    int32_t val;
    val = value;
    return vld1_lane_s32(&val, vec, lane);
}

poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
#define vset_lane_p8 vset_lane_u8

poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
#define vset_lane_p16 vset_lane_u16

float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
{
    float32_t val;
    val = value;
    return vld1_lane_f32(&val, vec, lane);
}

uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
{
    uint8_t val;
    val = value;
    return vld1q_lane_u8(&val, vec, lane);
}

uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
{
    uint16_t val;
    val = value;
    return vld1q_lane_u16(&val, vec, lane);
}

uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
{
    uint32_t val;
    val = value;
    return vld1q_lane_u32(&val, vec, lane);
}

int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
{
    int8_t val;
    val = value;
    return vld1q_lane_s8(&val, vec, lane);
}

int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
{
    int16_t val;
    val = value;
    return vld1q_lane_s16(&val, vec, lane);
}

int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
{
    int32_t val;
    val = value;
    return vld1q_lane_s32(&val, vec, lane);
}

poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
#define vsetq_lane_p8 vsetq_lane_u8

poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
#define vsetq_lane_p16 vsetq_lane_u16

float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
{
    float32_t val;
    val = value;
    return vld1q_lane_f32(&val, vec, lane);
}

int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
_NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
{
    int64_t val;
    val = value;
    return vld1_lane_s64(&val, vec, lane);
}

uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
_NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
{
    uint64_t val;
    val = value;
    return vld1_lane_u64(&val, vec, lane);
}

int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
{
    int64_t val;
    val = value;
    return vld1q_lane_s64(&val, vec, lane);
}

uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
#define vsetq_lane_u64 vsetq_lane_s64
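
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//vsetq_lane_* replaces a single lane and leaves the remaining lanes of the source vector unchanged.
#if 0
float32x4_t example_clear_lane1(float32x4_t v)
{
    return vsetq_lane_f32(0.0f, v, 1);
}
#endif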
// *******************************************************************************
// **************** Initialize a vector from bit pattern ***************************
// *******************************************************************************
//These intrinsics create a vector from a literal bit pattern.
int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
#define vcreate_s8(a) (*(__m64_128*)&(a))

int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
#define vcreate_s16 vcreate_s8

int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
#define vcreate_s32 vcreate_s8

float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
//no IA32 SIMD available

float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
#define vcreate_f32(a) (*(__m64_128*)&(a))

uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
#define vcreate_u8 vcreate_s8

uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
#define vcreate_u16 vcreate_s16

uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
#define vcreate_u32 vcreate_s32

uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
#define vcreate_u64 vcreate_s8

poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
#define vcreate_p8 vcreate_u8

poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
#define vcreate_p16 vcreate_u16

int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
#define vcreate_s64 vcreate_u64
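
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//vcreate_* reinterprets a 64-bit literal, lane 0 taking the least significant byte/halfword/word.
#if 0
uint8x8_t example_descending_bytes(void)
{
    return vcreate_u8(0x0001020304050607ULL); //lane 0 = 0x07, lane 1 = 0x06, ... lane 7 = 0x00
}
#endif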
//********************* Set all lanes to same value ********************************
//*********************************************************************************
//These intrinsics set all lanes to the same value.
uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint8x8_t res;
    int i;
    for (i = 0; i<8; i++) {
        res.m64_u8[i] = value;
    }
    return res;
}

uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint16x4_t res;
    int i;
    for (i = 0; i<4; i++) {
        res.m64_u16[i] = value;
    }
    return res;
}

uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
{
    uint32x2_t res;
    res.m64_u32[0] = value;
    res.m64_u32[1] = value;
    return res;
}

int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int8x8_t res;
    int i;
    for (i = 0; i<8; i++) {
        res.m64_i8[i] = value;
    }
    return res;
}

int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int16x4_t res;
    int i;
    for (i = 0; i<4; i++) {
        res.m64_i16[i] = value;
    }
    return res;
}

int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int32x2_t res;
    res.m64_i32[0] = value;
    res.m64_i32[1] = value;
    return res;
}

poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
#define vdup_n_p8 vdup_n_u8

poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
#define vdup_n_p16 vdup_n_s16

float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
_NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
{
    float32x2_t res;
    res.m64_f32[0] = value;
    res.m64_f32[1] = value;
    return res;
}

uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
#define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))

uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
#define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))

uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
#define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))

int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
#define vdupq_n_s8 _mm_set1_epi8

int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
#define vdupq_n_s16 _mm_set1_epi16

int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
#define vdupq_n_s32 _mm_set1_epi32

poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
#define vdupq_n_p8 vdupq_n_u8

poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
#define vdupq_n_p16 vdupq_n_u16

float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
#define vdupq_n_f32 _mm_set1_ps

int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
{
    int64x1_t res;
    res.m64_i64[0] = value;
    return res;
}

uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value)
{
    uint64x1_t res;
    res.m64_u64[0] = value;
    return res;
}

int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
_NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value)
{
    _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
    return LOAD_SI128(value2);
}

uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
_NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value)
{
    _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
    return LOAD_SI128(val);
}

//**** Set all lanes to same value ************************
//Same functions as above - just aliases.********************
//Probably they reflect the fact that 128-bit functions versions use VMOV instruction **********
uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
#define vmov_n_u8 vdup_n_s8

uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
#define vmov_n_u16 vdup_n_s16

uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
#define vmov_n_u32 vdup_n_u32

int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
#define vmov_n_s8 vdup_n_s8

int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
#define vmov_n_s16 vdup_n_s16

int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
#define vmov_n_s32 vdup_n_s32

poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
#define vmov_n_p8 vdup_n_u8

poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
#define vmov_n_p16 vdup_n_s16

float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
#define vmov_n_f32 vdup_n_f32

uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
#define vmovq_n_u8 vdupq_n_u8

uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
#define vmovq_n_u16 vdupq_n_s16

uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
#define vmovq_n_u32 vdupq_n_u32

int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
#define vmovq_n_s8 vdupq_n_s8

int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
#define vmovq_n_s16 vdupq_n_s16

int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
#define vmovq_n_s32 vdupq_n_s32

poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
#define vmovq_n_p8 vdupq_n_u8

poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
#define vmovq_n_p16 vdupq_n_s16

float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
#define vmovq_n_f32 vdupq_n_f32

int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
#define vmov_n_s64 vdup_n_s64

uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
#define vmov_n_u64 vdup_n_u64

int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
#define vmovq_n_s64 vdupq_n_s64

uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
#define vmovq_n_u64 vdupq_n_u64
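
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//vdupq_n_* maps directly to _mm_set1_*; the vmovq_n_* aliases above compile to exactly the same code.
#if 0
int16x8_t example_broadcast_short(int16_t c)
{
    return vdupq_n_s16(c); //all 8 lanes equal to c, same as vmovq_n_s16(c)
}
#endif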
//**************Set all lanes to the value of one lane of a vector *************
//****************************************************************************
//here shuffle is better solution than lane extraction followed by set1 function
uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
_NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
{
    uint8x8_t res;
    uint8_t valane;
    int i;
    valane = vec.m64_u8[lane];
    for (i = 0; i<8; i++) {
        res.m64_u8[i] = valane;
    }
    return res;
}

uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
_NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
{
    uint16x4_t res;
    uint16_t valane;
    valane = vec.m64_u16[lane];
    res.m64_u16[0] = valane;
    res.m64_u16[1] = valane;
    res.m64_u16[2] = valane;
    res.m64_u16[3] = valane;
    return res;
}

uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
{
    uint32x2_t res;
    res.m64_u32[0] = vec.m64_u32[lane];
    res.m64_u32[1] = res.m64_u32[0];
    return res;
}

int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
#define vdup_lane_s8 vdup_lane_u8

int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
#define vdup_lane_s16 vdup_lane_u16

int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
#define vdup_lane_s32 vdup_lane_u32

poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
#define vdup_lane_p8 vdup_lane_u8

poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
#define vdup_lane_p16 vdup_lane_s16

float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
_NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
{
    float32x2_t res;
    res.m64_f32[0] = vec.m64_f32[lane];
    res.m64_f32[1] = res.m64_f32[0];
    return res;
}

uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
_NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
{
    _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane};
    return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
}

uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
_NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
{
    //we could use 8bit shuffle for 16 bit as well
    const int8_t lane16 = ((int8_t) lane) << 1;
    _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1,
                                                  lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1};
    return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
}

uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
#define vdupq_lane_u32(vec, lane) _mm_shuffle_epi32 (_pM128i(vec), lane | (lane << 2) | (lane << 4) | (lane << 6))

int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
#define vdupq_lane_s8 vdupq_lane_u8

int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
#define vdupq_lane_s16 vdupq_lane_u16

int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
#define vdupq_lane_s32 vdupq_lane_u32

poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
#define vdupq_lane_p8 vdupq_lane_u8

poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
#define vdupq_lane_p16 vdupq_lane_s16

float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
#define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane))

int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
#define vdup_lane_s64(vec,lane) vec

uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
#define vdup_lane_u64(vec,lane) vec

int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
_NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
{
    __m128i vec128;
    vec128 = _pM128i(vec);
    return _mm_unpacklo_epi64(vec128,vec128);
}

uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
#define vdupq_lane_u64 vdupq_lane_s64
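
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//broadcasting a lane with vdupq_lane_* needs a single shuffle here, while the
//vdupq_n_*(vget_lane_*(...)) form goes through a general purpose register first.
#if 0
uint16x8_t example_broadcast_lane3(uint16x4_t v)
{
    return vdupq_lane_u16(v, 3); //preferred: one shuffle
    //slower equivalent: return vdupq_n_u16(vget_lane_u16(v, 3));
}
#endif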
// ********************************************************************
// ******************** Combining vectors *****************************
// ********************************************************************
//These intrinsics join two 64 bit vectors into a single 128bit vector.
int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
#define vcombine_s8(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )

int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
#define vcombine_s16(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )

int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
#define vcombine_s32(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )

int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
#define vcombine_s64(low, high) _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )

float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
//current IA SIMD doesn't support float16

float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
_NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
{
    __m128i res;
    res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
    return _M128(res);
}

uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
#define vcombine_u8 vcombine_s8

uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
#define vcombine_u16 vcombine_s16

uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
#define vcombine_u32 vcombine_s32

uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
#define vcombine_u64 vcombine_s64

poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
#define vcombine_p8 vcombine_u8

poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
#define vcombine_p16 vcombine_u16

//**********************************************************************
//************************* Splitting vectors **************************
//**********************************************************************
//**************** Get high part ******************************************
//These intrinsics split a 128 bit vector into 2 component 64 bit vectors
int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
_NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
{
    int8x8_t res64;
    __m128i res;
    res = _mm_unpackhi_epi64(a,a); //SSE2
    return64(res);
}

int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
_NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
{
    int16x4_t res64;
    __m128i res;
    res = _mm_unpackhi_epi64(a,a); //SSE2
    return64(res);
}

int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
_NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
{
    int32x2_t res64;
    __m128i res;
    res = _mm_unpackhi_epi64(a,a); //SSE2
    return64(res);
}

int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
_NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
{
    int64x1_t res64;
    __m128i res;
    res = _mm_unpackhi_epi64(a,a); //SSE2
    return64(res);
}

float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
// IA32 SIMD doesn't work with 16bit floats currently

float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
_NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
{
    __m128i res;
    __m64_128 res64;
    res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
    return64(res);
}

uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
#define vget_high_u8 vget_high_s8

uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
#define vget_high_u16 vget_high_s16

uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
#define vget_high_u32 vget_high_s32

uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
#define vget_high_u64 vget_high_s64

poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
#define vget_high_p8 vget_high_u8

poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
#define vget_high_p16 vget_high_u16

//********************** Get low part **********************
//**********************************************************
int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
_NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
{
    int8x8_t res64;
    return64(a);
}

int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
_NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
{
    int16x4_t res64;
    return64(a);
}

int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
_NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
{
    int32x2_t res64;
    return64(a);
}

int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
_NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
{
    int64x1_t res64;
    return64(a);
}

float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
// IA32 SIMD doesn't work with 16bit floats currently

float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
_NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
{
    float32x2_t res64;
    _M64f(res64, a);
    return res64;
}

uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
#define vget_low_u8 vget_low_s8

uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
#define vget_low_u16 vget_low_s16

uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
#define vget_low_u32 vget_low_s32

uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
#define vget_low_u64 vget_low_s64

poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
#define vget_low_p8 vget_low_u8

poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
#define vget_low_p16 vget_low_s16
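
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* name is hypothetical):
//vget_low/vget_high plus vcombine give a cheap way to rearrange the 64-bit halves of a 128-bit vector.
#if 0
int8x16_t example_swap_halves(int8x16_t q)
{
    return vcombine_s8(vget_high_s8(q), vget_low_s8(q));
}
#endif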
//**************************************************************************
//************************ Converting vectors **********************************
//**************************************************************************
//************* Convert from float ***************************************
// need to set _MM_SET_ROUNDING_MODE ( x) accordingly
int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
_NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a)
{
    int32x2_t res64;
    __m128i res;
    res = _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only
    return64(res);
}

uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
_NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
{
    //may be not effective compared with a serial SIMD solution
    uint32x2_t res64;
    __m128i res;
    res = vcvtq_u32_f32(_pM128(a));
    return64(res);
}

int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
#define vcvtq_s32_f32 _mm_cvttps_epi32

uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
{
    //No single instruction SSE solution but we could implement it as following:
    __m128i resi;
    __m128 zero, mask, a_pos, mask_f_max_si, res;
    _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    zero = _mm_setzero_ps();
    mask = _mm_cmpgt_ps(a, zero);
    a_pos = _mm_and_ps(a, mask);
    mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff);
    res = _mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything
    resi = _mm_cvttps_epi32(res);
    return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si);
}

// ***** Convert to the fixed point with the number of fraction bits specified by b ***********
//*************************************************************************************************
int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
_NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
{
    int32x2_t res64;
    return64(vcvtq_n_s32_f32(_pM128(a),b));
}

uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
_NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
{
    uint32x2_t res;
    float convconst;
    convconst = (float)((uint32_t)1 << b);
    res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
    res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
    return res;
}

int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
_NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
{
    float convconst;
    _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    __m128 cconst128;
    __m128i mask, res;
    convconst = (float)(1 << b);
    cconst128 = vdupq_n_f32(convconst);
    res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
    mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
    return _mm_xor_si128 (res, mask); //res saturated for 0x80000000
}

uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
_NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
{
    float convconst;
    __m128 cconst128;
    convconst = (float)(1 << b);
    cconst128 = vdupq_n_f32(convconst);
    return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
}

//***************** Convert to float *************************
//*************************************************************
float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
_NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
{
    float32x2_t res;
    res.m64_f32[0] = (float) a.m64_i32[0];
    res.m64_f32[1] = (float) a.m64_i32[1];
    return res;
}

float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
_NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
{
    float32x2_t res;
    res.m64_f32[0] = (float) a.m64_u32[0];
    res.m64_f32[1] = (float) a.m64_u32[1];
    return res;
}

float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
#define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)

float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
_NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
{
    //solution may be not optimal
    __m128 two16, fHi, fLo;
    __m128i hi, lo;
    two16 = _mm_set1_ps((float)0x10000); //2^16
    // Avoid double rounding by doing two exact conversions
    // of high and low 16-bit segments
    hi = _mm_srli_epi32(a, 16);
    lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
    fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
    fLo = _mm_cvtepi32_ps(lo);
    // do single rounding according to current rounding mode
    return _mm_add_ps(fHi, fLo);
}

// ***** Convert to the float from fixed point with the number of fraction bits specified by b ***********
float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
_NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
{
    float32x2_t res;
    float convconst;
    convconst = (float)(1. / ((uint32_t)1 << b));
    res.m64_f32[0] = a.m64_i32[0] * convconst;
    res.m64_f32[1] = a.m64_i32[1] * convconst;
    return res;
}

float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
_NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
{
    float32x2_t res;
    float convconst;
    convconst = (float)(1. / ((uint32_t)1 << b));
    res.m64_f32[0] = a.m64_u32[0] * convconst;
    res.m64_f32[1] = a.m64_u32[1] * convconst;
    return res;
}

float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
{
    float convconst;
    __m128 cconst128, af;
    convconst = (float)(1. / ((uint32_t)1 << b));
    af = _mm_cvtepi32_ps(a);
    cconst128 = vdupq_n_f32(convconst);
    return _mm_mul_ps(af,cconst128);
}

float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
_NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
{
    float convconst;
    __m128 cconst128, af;
    convconst = (float)(1. / (1 << b));
    af = vcvtq_f32_u32(a);
    cconst128 = vdupq_n_f32(convconst);
    return _mm_mul_ps(af,cconst128);
}
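
//Illustrative usage sketch (not part of the original mapping, compiled out, example_* names are hypothetical):
//with b fraction bits the fixed point value is float_value * 2^b, so e.g. Q16.16 data converts as below.
#if 0
float32x4_t example_q16_to_float(int32x4_t q16)
{
    return vcvtq_n_f32_s32(q16, 16); //multiply by 2^-16
}
int32x4_t example_float_to_q16(float32x4_t f)
{
    return vcvtq_n_s32_f32(f, 16); //multiply by 2^16 and truncate
}
#endif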
12854 //**************Convert between floats ***********************
12855 //************************************************************
12856 float16x4_t
vcvt_f16_f32(float32x4_t a
); // VCVT.F16.F32 d0, q0
12857 //Intel SIMD doesn't support 16bits floats curently
12859 float32x4_t
vcvt_f32_f16(float16x4_t a
); // VCVT.F32.F16 q0, d0
12860 //Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits
//************Vector narrow integer conversion (truncation) ******************
//****************************************************************************
int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
_NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
{
    int8x8_t res64;
    __m128i res;
    _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
    res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
    return64(res);
}

int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
_NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
{
    int16x4_t res64;
    __m128i res;
    _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15 };
    res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
    return64(res);
}

int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
_NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
{
    //may be not effective compared with a serial implementation
    int32x2_t res64;
    __m128i res;
    res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
    return64(res);
}

uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
#define vmovn_u16 vmovn_s16

uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
#define vmovn_u32 vmovn_s32

uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
#define vmovn_u64 vmovn_s64
//**************** Vector long move ***********************
//***********************************************************
int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
#define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1

int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
#define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1

int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
#define vmovl_s32(a) _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1

uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
#define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1

uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
#define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1

uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
#define vmovl_u32(a) _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1
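//A minimal sketch (illustrative only; a8 and b8 are hypothetical uint8x8_t inputs): the usual
//widen-compute-narrow pattern, averaging two byte vectors without intermediate overflow.
//    uint16x8_t wa = vmovl_u8(a8);                             //zero-extend to 16 bits
//    uint16x8_t wb = vmovl_u8(b8);
//    uint16x8_t half_sum = vshrq_n_u16(vaddq_u16(wa, wb), 1);  //(a+b)/2 computed in 16 bits
//    uint8x8_t avg = vmovn_u16(half_sum);                      //fits in 8 bits, plain truncation is safe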
//*************Vector saturating narrow integer*****************
//**************************************************************
int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
_NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a)
{
    int8x8_t res64;
    __m128i res;
    res = _mm_packs_epi16(a, a);
    return64(res);
}

int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
_NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a)
{
    int16x4_t res64;
    __m128i res;
    res = _mm_packs_epi32(a, a);
    return64(res);
}

int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
{
    int32x2_t res;
    _NEON2SSE_ALIGN_16 int64_t atmp[2];
    _mm_store_si128((__m128i*)atmp, a);
    if (atmp[0] > SINT_MAX) atmp[0] = SINT_MAX;
    if (atmp[0] < SINT_MIN) atmp[0] = SINT_MIN;
    if (atmp[1] > SINT_MAX) atmp[1] = SINT_MAX;
    if (atmp[1] < SINT_MIN) atmp[1] = SINT_MIN;
    res.m64_i32[0] = (int32_t)atmp[0];
    res.m64_i32[1] = (int32_t)atmp[1];
    return res;
}

uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.U16 d0,q0
{
    //no uint16 to uint8 conversion in SSE, need truncate to max signed first
    uint8x8_t res64;
    __m128i c7fff, a_trunc;
    c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
    a_trunc = _mm_and_si128(a, c7fff); // a truncated to max signed
    a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //use low 64 bits only
    return64(a_trunc);
}

uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
{
    //no uint32 to uint16 conversion in SSE, need truncate to max signed first
    uint16x4_t res64;
    __m128i c7fffffff, a_trunc;
    c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-st bit set to zero
    a_trunc = _mm_and_si128(a, c7fffffff); // a truncated to max signed
    a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64 bits only
    return64(a_trunc);
}

uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
{
    //serial solution may be faster
    uint32x2_t res64;
    __m128i res_hi, mask;
    mask = _mm_setzero_si128();
    res_hi = _mm_srli_epi64(a, 32);
    res_hi = _mm_cmpeq_epi32(res_hi, mask);
    mask = _mm_cmpeq_epi32(mask, mask); //all fff
    mask = _mm_andnot_si128(res_hi, mask); //invert res_hi to get the >32-bit numbers
    res_hi = _mm_or_si128(a, mask);
    res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
    return64(res_hi);
}
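//A minimal sketch (illustrative only): saturating narrowing clamps instead of wrapping, unlike vmovn.
//    int32x4_t sums = vdupq_n_s32(100000);
//    int16x4_t clamped = vqmovn_s32(sums);                 //every lane becomes 32767 (INT16_MAX)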
//************* Vector saturating narrow integer signed->unsigned **************
//*****************************************************************************
uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
{
    uint8x8_t res64;
    __m128i res;
    res = _mm_packus_epi16(a, a); //use low 64 bits only
    return64(res);
}

uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
{
    uint16x4_t res64;
    __m128i res;
    res = _MM_PACKUS1_EPI32(a); //use low 64 bits only
    return64(res);
}

uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
{
    uint32x2_t res64;
    __m128i res_hi, res_lo, zero, cmp;
    zero = _mm_setzero_si128();
    res_hi = _mm_srli_epi64(a, 32);
    cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
    res_lo = _mm_andnot_si128(cmp, a); //if cmp is zero - do nothing, otherwise cmp<0 and the result is 0
    cmp = _mm_cmpgt_epi32(res_hi, zero); //if cmp positive
    res_lo = _mm_or_si128(res_lo, cmp); //if cmp positive we are out of 32 bits and need to saturate to 0xffffffff
    res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
    return64(res_lo);
}
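//A minimal sketch (illustrative only): a common use is converting signed 16-bit filter output back to
//8-bit pixels, clamping negative results to 0 and anything above 255 to 255 in one step.
//    int16x8_t filtered = vdupq_n_s16(-20);                //negative after filtering
//    uint8x8_t pixels = vqmovun_s16(filtered);             //negative lanes saturate to 0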
// ********************************************************
// **************** Table look up **************************
// ********************************************************
//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
//in a table and generate a new vector. Indexes out of range return 0.
//for Intel SIMD we need to set the MSB to 1 for zero return
uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
_NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
{
    uint8x8_t res64;
    __m128i c7, maskgt, bmask, b128;
    c7 = _mm_set1_epi8 (7);
    b128 = _pM128i(b);
    maskgt = _mm_cmpgt_epi8(b128, c7);
    bmask = _mm_or_si128(b128, maskgt);
    bmask = _mm_shuffle_epi8(_pM128i(a), bmask);
    return64(bmask);
}

int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
#define vtbl1_s8 vtbl1_u8

poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
#define vtbl1_p8 vtbl1_u8
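//A minimal sketch (illustrative only, hypothetical table and index values): vtbl1_u8 as an 8-entry byte
//lookup table with NEON semantics, i.e. out-of-range indexes produce 0.
//    _NEON2SSE_ALIGN_16 uint8_t tbl[8] = {10, 20, 30, 40, 50, 60, 70, 80};
//    _NEON2SSE_ALIGN_16 uint8_t ind[8] = {7, 6, 5, 4, 3, 2, 1, 9};   //9 is out of range
//    uint8x8_t vals = vtbl1_u8(vld1_u8(tbl), vld1_u8(ind));
//    //the first seven lanes pick entries 7..1 (80, 70, ..., 20), the last lane becomes 0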
//Special trick to avoid the "__declspec(align('8')) won't be aligned" error
//uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
_NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b)
{
    uint8x8_t res64;
    __m128i c15, a01, maskgt15, bmask, b128;
    c15 = _mm_set1_epi8 (15);
    b128 = _pM128i(b);
    maskgt15 = _mm_cmpgt_epi8(b128, c15);
    bmask = _mm_or_si128(b128, maskgt15);
    a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
    a01 = _mm_shuffle_epi8(a01, bmask);
    return64(a01);
}
#define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b)

//int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
#define vtbl2_s8 vtbl2_u8

//poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
#define vtbl2_p8 vtbl2_u8
//Special trick to avoid the "__declspec(align('16')) won't be aligned" error
//uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
_NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b)
{
    //solution may be not optimal
    uint8x8_t res64;
    __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
    c15 = _mm_set1_epi8 (15);
    c23 = _mm_set1_epi8 (23);
    b128 = _pM128i(b);
    maskgt23 = _mm_cmpgt_epi8(b128, c23);
    bmask = _mm_or_si128(b128, maskgt23);
    maskgt15 = _mm_cmpgt_epi8(b128, c15);
    a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
    sh0 = _mm_shuffle_epi8(a01, bmask);
    sh1 = _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 bi is wrapped (bi-=16)
    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
    return64(sh0);
}
#define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b)

//int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
#define vtbl3_s8 vtbl3_u8

//poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
#define vtbl3_p8 vtbl3_u8

//uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
_NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b)
{
    //solution may be not optimal
    uint8x8_t res64;
    __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
    c15 = _mm_set1_epi8 (15);
    c31 = _mm_set1_epi8 (31);
    b128 = _pM128i(b);
    maskgt31 = _mm_cmpgt_epi8(b128, c31);
    bmask = _mm_or_si128(b128, maskgt31);
    maskgt15 = _mm_cmpgt_epi8(b128, c15);
    a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
    a23 = _mm_unpacklo_epi64(_pM128i(a->val[2]), _pM128i(a->val[3]));
    sh0 = _mm_shuffle_epi8(a01, bmask);
    sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=16)
    sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
    return64(sh0);
}
#define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b)

//int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
#define vtbl4_s8 vtbl4_u8

//poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
#define vtbl4_p8 vtbl4_u8
//****************** Extended table look up intrinsics ***************************
//**********************************************************************************
//VTBX (Vector Table Extension) works in the same way as VTBL does,
// except that indexes out of range leave the destination element unchanged.
uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
_NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
    uint8x8_t res64;
    __m128i c7, maskgt, sh, c128;
    c7 = _mm_set1_epi8 (7);
    c128 = _pM128i(c);
    maskgt = _mm_cmpgt_epi8(c128, c7);
    c7 = _mm_and_si128(maskgt, _pM128i(a));
    sh = _mm_shuffle_epi8(_pM128i(b), c128);
    sh = _mm_andnot_si128(maskgt, sh);
    sh = _mm_or_si128(sh, c7);
    return64(sh);
}

int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
#define vtbx1_s8 vtbx1_u8

poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
#define vtbx1_p8 vtbx1_u8
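//A minimal sketch (illustrative only, hypothetical values): with the same table and indexes as a VTBL
//call, VTBX keeps the destination lane wherever the index is out of range instead of zeroing it.
//    _NEON2SSE_ALIGN_16 uint8_t tbl[8] = {10, 20, 30, 40, 50, 60, 70, 80};
//    _NEON2SSE_ALIGN_16 uint8_t ind[8] = {0, 1, 2, 3, 4, 5, 6, 9};   //last index is out of range
//    uint8x8_t merged = vtbx1_u8(vdup_n_u8(0xFF), vld1_u8(tbl), vld1_u8(ind));
//    //lanes 0..6 become 10..70, the last lane keeps its previous value 0xFF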
//Special trick to avoid the "__declspec(align('8')) won't be aligned" error
//uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
_NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c)
{
    uint8x8_t res64;
    __m128i c15, b01, maskgt15, sh, c128;
    c15 = _mm_set1_epi8 (15);
    c128 = _pM128i(c);
    maskgt15 = _mm_cmpgt_epi8(c128, c15);
    c15 = _mm_and_si128(maskgt15, _pM128i(a));
    b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
    sh = _mm_shuffle_epi8(b01, c128);
    sh = _mm_andnot_si128(maskgt15, sh);
    sh = _mm_or_si128(sh, c15);
    return64(sh);
}
#define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c)

//int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
#define vtbx2_s8 vtbx2_u8

//poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
#define vtbx2_p8 vtbx2_u8
//uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
_NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c)
{
    //solution may be not optimal
    uint8x8_t res64;
    __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
    c15 = _mm_set1_epi8 (15);
    c23 = _mm_set1_epi8 (23);
    c128 = _pM128i(c);
    maskgt15 = _mm_cmpgt_epi8(c128, c15);
    maskgt23 = _mm_cmpgt_epi8(c128, c23);
    c23 = _mm_and_si128(maskgt23, _pM128i(a));
    b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
    sh0 = _mm_shuffle_epi8(b01, c128);
    sh1 = _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 bi is wrapped (bi-=16)
    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
    sh0 = _mm_andnot_si128(maskgt23, sh0);
    sh0 = _mm_or_si128(sh0, c23);
    return64(sh0);
}
#define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c)

//int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c);
#define vtbx3_s8 vtbx3_u8

//poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c);
#define vtbx3_p8 vtbx3_u8

//uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
_NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c)
{
    //solution may be not optimal
    uint8x8_t res64;
    __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
    c15 = _mm_set1_epi8 (15);
    c31 = _mm_set1_epi8 (31);
    c128 = _pM128i(c);
    maskgt15 = _mm_cmpgt_epi8(c128, c15);
    maskgt31 = _mm_cmpgt_epi8(c128, c31);
    c31 = _mm_and_si128(maskgt31, _pM128i(a));
    b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
    b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]), _pM128i(b->val[3]));
    sh0 = _mm_shuffle_epi8(b01, c128);
    sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=16)
    sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
    sh0 = _mm_andnot_si128(maskgt31, sh0);
    sh0 = _mm_or_si128(sh0, c31);
    return64(sh0);
}
#define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c)

//int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c);
#define vtbx4_s8 vtbx4_u8

//poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c);
#define vtbx4_p8 vtbx4_u8
//*************************************************************************************************
// *************************** Operations with a scalar value *********************************
//*************************************************************************************************

//******* Vector multiply accumulate by scalar *************************************************
//**********************************************************************************************
int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
_NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
{
    int16_t c;
    int16x4_t scalar;
    c = vget_lane_s16(v, l);
    scalar = vdup_n_s16(c);
    return vmla_s16(a, b, scalar);
}

int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
_NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
{
    int32_t c;
    int32x2_t scalar;
    c = vget_lane_s32(v, l);
    scalar = vdup_n_s32(c);
    return vmla_s32(a, b, scalar);
}

uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
#define vmla_lane_u16 vmla_lane_s16

uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
#define vmla_lane_u32 vmla_lane_s32

float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
_NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
{
    float32_t vlane;
    float32x2_t c;
    vlane = vget_lane_f32(v, l);
    c = vdup_n_f32(vlane);
    return vmla_f32(a, b, c);
}

int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
_NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
{
    int16_t vlane;
    int16x8_t c;
    vlane = vget_lane_s16(v, l);
    c = vdupq_n_s16(vlane);
    return vmlaq_s16(a, b, c);
}

int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
{
    int32_t vlane;
    int32x4_t c;
    vlane = vget_lane_s32(v, l);
    c = vdupq_n_s32(vlane);
    return vmlaq_s32(a, b, c);
}

uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
#define vmlaq_lane_u16 vmlaq_lane_s16

uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
#define vmlaq_lane_u32 vmlaq_lane_s32

float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
_NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
{
    float32_t vlane;
    float32x4_t c;
    vlane = vget_lane_f32(v, l);
    c = vdupq_n_f32(vlane);
    return vmlaq_f32(a, b, c);
}
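//A minimal sketch (illustrative only; col0..col3 and v are hypothetical float32x4_t values): a 4x4
//matrix * vector product accumulated column by column with the lane forms above.
//    float32x4_t acc = vmulq_lane_f32(col0, vget_low_f32(v), 0);    //acc  = col0 * v[0]
//    acc = vmlaq_lane_f32(acc, col1, vget_low_f32(v), 1);           //acc += col1 * v[1]
//    acc = vmlaq_lane_f32(acc, col2, vget_high_f32(v), 0);          //acc += col2 * v[2]
//    acc = vmlaq_lane_f32(acc, col3, vget_high_f32(v), 1);          //acc += col3 * v[3]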
//***************** Vector widening multiply accumulate by scalar **********************
//***************************************************************************************
int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
{
    int16_t vlane;
    int16x4_t c;
    vlane = vget_lane_s16(v, l);
    c = vdup_n_s16(vlane);
    return vmlal_s16(a, b, c);
}

int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
{
    int32_t vlane;
    int32x2_t c;
    vlane = vget_lane_s32(v, l);
    c = vdup_n_s32(vlane);
    return vmlal_s32(a, b, c);
}

uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.U16 q0, d0, d0[0]
{
    uint16_t vlane;
    uint16x4_t c;
    vlane = vget_lane_u16(v, l);
    c = vdup_n_u16(vlane);
    return vmlal_u16(a, b, c);
}

uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
_NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
{
    uint32_t vlane;
    uint32x2_t c;
    vlane = vget_lane_u32(v, l);
    c = vdup_n_u32(vlane);
    return vmlal_u32(a, b, c);
}
// ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
// ************************************************************************************************
int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
{
    int16_t vlane;
    int16x4_t c;
    vlane = vget_lane_s16(v, l);
    c = vdup_n_s16(vlane);
    return vqdmlal_s16(a, b, c);
}

int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
{
    int32_t vlane;
    int32x2_t c;
    vlane = vget_lane_s32(v, l);
    c = vdup_n_s32(vlane);
    return vqdmlal_s32(a, b, c);
}
// ****** Vector multiply subtract by scalar *****************
// *************************************************************
int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
_NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
{
    int16_t vlane;
    int16x4_t c;
    vlane = vget_lane_s16(v, l);
    c = vdup_n_s16(vlane);
    return vmls_s16(a, b, c);
}

int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
_NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
{
    int32_t vlane;
    int32x2_t c;
    vlane = vget_lane_s32(v, l);
    c = vdup_n_s32(vlane);
    return vmls_s32(a, b, c);
}

uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
_NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
{
    uint16_t vlane;
    uint16x4_t c;
    vlane = vget_lane_s16(v, l);
    c = vdup_n_s16(vlane);
    return vmls_s16(a, b, c);
}

uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
_NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
{
    uint32_t vlane;
    uint32x2_t c;
    vlane = vget_lane_u32(v, l);
    c = vdup_n_u32(vlane);
    return vmls_u32(a, b, c);
}

float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
_NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
{
    float32_t vlane;
    float32x2_t c;
    vlane = (float) vget_lane_f32(v, l);
    c = vdup_n_f32(vlane);
    return vmls_f32(a, b, c);
}

int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
_NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
{
    int16_t vlane;
    int16x8_t c;
    vlane = vget_lane_s16(v, l);
    c = vdupq_n_s16(vlane);
    return vmlsq_s16(a, b, c);
}

int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
{
    int32_t vlane;
    int32x4_t c;
    vlane = vget_lane_s32(v, l);
    c = vdupq_n_s32(vlane);
    return vmlsq_s32(a, b, c);
}

uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
_NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
{
    uint16_t vlane;
    uint16x8_t c;
    vlane = vget_lane_u16(v, l);
    c = vdupq_n_u16(vlane);
    return vmlsq_u16(a, b, c);
}

uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
{
    uint32_t vlane;
    uint32x4_t c;
    vlane = vget_lane_u32(v, l);
    c = vdupq_n_u32(vlane);
    return vmlsq_u32(a, b, c);
}

float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
_NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
{
    float32_t vlane;
    float32x4_t c;
    vlane = (float) vget_lane_f32(v, l);
    c = vdupq_n_f32(vlane);
    return vmlsq_f32(a, b, c);
}
// **** Vector widening multiply subtract by scalar ****
// ****************************************************
int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
{
    int16_t vlane;
    int16x4_t c;
    vlane = vget_lane_s16(v, l);
    c = vdup_n_s16(vlane);
    return vmlsl_s16(a, b, c);
}

int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
{
    int32_t vlane;
    int32x2_t c;
    vlane = vget_lane_s32(v, l);
    c = vdup_n_s32(vlane);
    return vmlsl_s32(a, b, c);
}

uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.U16 q0, d0, d0[0]
{
    uint16_t vlane;
    uint16x4_t c;
    vlane = vget_lane_u16(v, l); //use the unsigned path so lane values above 0x7fff widen correctly
    c = vdup_n_u16(vlane);
    return vmlsl_u16(a, b, c);
}

uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
_NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
{
    uint32_t vlane;
    uint32x2_t c;
    vlane = vget_lane_u32(v, l);
    c = vdup_n_u32(vlane);
    return vmlsl_u32(a, b, c);
}
//********* Vector widening saturating doubling multiply subtract by scalar **************************
//******************************************************************************************************
int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
{
    int16_t vlane;
    int16x4_t c;
    vlane = vget_lane_s16(v, l);
    c = vdup_n_s16(vlane);
    return vqdmlsl_s16(a, b, c);
}

int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int32_t vlane;
    int32x2_t c;
    vlane = vget_lane_s32(v, l);
    c = vdup_n_s32(vlane);
    return vqdmlsl_s32(a, b, c);
}
//********** Vector multiply with scalar *****************************
int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
_NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
{
    int16x4_t b16x4;
    b16x4 = vdup_n_s16(b);
    return vmul_s16(a, b16x4);
}

int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
_NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
{
    //serial solution looks faster
    int32x2_t b32x2;
    b32x2 = vdup_n_s32(b);
    return vmul_s32(a, b32x2);
}

float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
_NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
{
    float32x2_t b32x2;
    b32x2 = vdup_n_f32(b);
    return vmul_f32(a, b32x2);
}

uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
_NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
{
    uint16x4_t b16x4;
    b16x4 = vdup_n_s16(b);
    return vmul_s16(a, b16x4);
}

uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
_NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
{
    //serial solution looks faster
    uint32x2_t b32x2;
    b32x2 = vdup_n_u32(b);
    return vmul_u32(a, b32x2);
}

int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
_NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
{
    int16x8_t b16x8;
    b16x8 = vdupq_n_s16(b);
    return vmulq_s16(a, b16x8);
}

int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
_NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
{
    int32x4_t b32x4;
    b32x4 = vdupq_n_s32(b);
    return vmulq_s32(a, b32x4);
}

float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
_NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
{
    float32x4_t b32x4;
    b32x4 = vdupq_n_f32(b);
    return vmulq_f32(a, b32x4);
}

uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
_NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
{
    uint16x8_t b16x8;
    b16x8 = vdupq_n_s16(b);
    return vmulq_s16(a, b16x8);
}

uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
_NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
{
    uint32x4_t b32x4;
    b32x4 = vdupq_n_u32(b);
    return vmulq_u32(a, b32x4);
}
//********** Vector multiply lane *****************************
int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
{
    int16_t vlane;
    int16x4_t b16x4;
    vlane = vget_lane_s16(b, c);
    b16x4 = vdup_n_s16(vlane);
    return vmul_s16(a, b16x4);
}

int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
{
    int32_t vlane;
    int32x2_t b32x2;
    vlane = vget_lane_s32(b, c);
    b32x2 = vdup_n_s32(vlane);
    return vmul_s32(a, b32x2);
}

float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
{
    float32_t vlane;
    float32x2_t b32x2;
    vlane = vget_lane_f32(b, c);
    b32x2 = vdup_n_f32(vlane);
    return vmul_f32(a, b32x2);
}

uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
#define vmul_lane_u16 vmul_lane_s16

uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
#define vmul_lane_u32 vmul_lane_s32

int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
{
    int16_t vlane;
    int16x8_t b16x8;
    vlane = vget_lane_s16(b, c);
    b16x8 = vdupq_n_s16(vlane);
    return vmulq_s16(a, b16x8);
}

int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
{
    int32_t vlane;
    int32x4_t b32x4;
    vlane = vget_lane_s32(b, c);
    b32x4 = vdupq_n_s32(vlane);
    return vmulq_s32(a, b32x4);
}

float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
{
    float32_t vlane;
    float32x4_t b32x4;
    vlane = vget_lane_f32(b, c);
    b32x4 = vdupq_n_f32(vlane);
    return vmulq_f32(a, b32x4);
}

uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
#define vmulq_lane_u16 vmulq_lane_s16

uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
#define vmulq_lane_u32 vmulq_lane_s32
//**** Vector long multiply with scalar ************
int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
_NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
{
    int16x4_t b16x4;
    b16x4 = vdup_n_s16(val2);
    return vmull_s16(vec1, b16x4);
}

int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
_NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
{
    int32x2_t b32x2;
    b32x2 = vdup_n_s32(val2);
    return vmull_s32(vec1, b32x2);
}

uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
_NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.U16 q0,d0,d0[0]
{
    uint16x4_t b16x4;
    b16x4 = vdup_n_u16(val2); //use the unsigned widening multiply so values above 0x7fff stay correct
    return vmull_u16(vec1, b16x4);
}

uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
_NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
{
    uint32x2_t b32x2;
    b32x2 = vdup_n_u32(val2);
    return vmull_u32(vec1, b32x2);
}
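//A minimal sketch (illustrative only): widening the product keeps results that no longer fit in 16 bits,
//assuming the unsigned path above.
//    uint16x4_t samples = vdup_n_u16(50000);
//    uint32x4_t scaled = vmull_n_u16(samples, 3);          //150000 per lane, too large for uint16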
//**** Vector long multiply by scalar ****
int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
_NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
{
    int16_t vlane;
    int16x4_t b;
    vlane = vget_lane_s16(val2, val3);
    b = vdup_n_s16(vlane);
    return vmull_s16(vec1, b);
}

int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
_NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
{
    int32_t vlane;
    int32x2_t b;
    vlane = vget_lane_s32(val2, val3);
    b = vdup_n_s32(vlane);
    return vmull_s32(vec1, b);
}

uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
_NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.U16 q0,d0,d0[0]
{
    uint16_t vlane;
    uint16x4_t b;
    vlane = vget_lane_u16(val2, val3); //use the unsigned widening multiply so values above 0x7fff stay correct
    b = vdup_n_u16(vlane);
    return vmull_u16(vec1, b);
}

uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
_NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
{
    uint32_t vlane;
    uint32x2_t b;
    vlane = vget_lane_u32(val2, val3);
    b = vdup_n_u32(vlane);
    return vmull_u32(vec1, b);
}
//********* Vector saturating doubling long multiply with scalar *******************
int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
_NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
{
    //the serial solution may be faster due to saturation
    int16x4_t b;
    b = vdup_n_s16(val2);
    return vqdmull_s16(vec1, b);
}

int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int32x2_t b;
    b = vdup_n_s32(val2);
    return vqdmull_s32(vec1, b); //slow serial function!!!!
}

//************* Vector saturating doubling long multiply by scalar ***********************************************
int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
_NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
{
    int16_t c;
    int16x4_t scalar;
    c = vget_lane_s16(val2, val3);
    scalar = vdup_n_s16(c);
    return vqdmull_s16(vec1, scalar);
}

int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int32_t c;
    int32x2_t scalar;
    c = vget_lane_s32(val2, val3);
    scalar = vdup_n_s32(c);
    return vqdmull_s32(vec1, scalar); //slow serial function!!!!
}
// *****Vector saturating doubling multiply high with scalar *****
int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
_NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2)
{
    int16x4_t res64;
    return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
}

int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
_NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2)
{
    int32x2_t res64;
    return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
}

int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
_NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0]
{
    //solution may be not optimal
    int16x8_t scalar;
    scalar = vdupq_n_s16(val2);
    return vqdmulhq_s16(vec1, scalar);
}

int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    int32x4_t scalar;
    scalar = vdupq_n_s32(val2);
    return vqdmulhq_s32(vec1, scalar);
}

//***** Vector saturating doubling multiply high by scalar ****************
int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
_NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0]
{
    //solution may be not optimal
    int16_t vlane;
    int16x4_t scalar;
    vlane = vget_lane_s16(val2, val3);
    scalar = vdup_n_s16(vlane);
    return vqdmulh_s16(vec1, scalar);
}

int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    int32_t vlane;
    int32x2_t scalar;
    vlane = vget_lane_s32(val2, val3);
    scalar = vdup_n_s32(vlane);
    return vqdmulh_s32(vec1, scalar);
}

int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
_NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0]
{
    //solution may be not optimal
    int16_t vlane;
    int16x8_t scalar;
    vlane = vget_lane_s16(val2, val3);
    scalar = vdupq_n_s16(vlane);
    return vqdmulhq_s16(vec1, scalar);
}

int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    //solution may be not optimal
    int32_t vlane;
    int32x4_t scalar;
    vlane = vgetq_lane_s32(_pM128i(val2), val3);
    scalar = vdupq_n_s32(vlane);
    return vqdmulhq_s32(vec1, scalar);
}
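//A minimal sketch (illustrative only): vqdmulhq_n_s16 is the usual Q15 fixed-point multiply,
//result = (a * val2 * 2) >> 16 with saturation.
//    int16x8_t q15 = vdupq_n_s16(0x2000);                  //0.25 in Q15
//    int16x8_t prod = vqdmulhq_n_s16(q15, 0x4000);         //0.25 * 0.5 = 0.125, i.e. 0x1000 per lane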
//******** Vector saturating rounding doubling multiply high with scalar ***
int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
_NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
{
    //solution may be not optimal
    int16x4_t scalar;
    scalar = vdup_n_s16(val2);
    return vqrdmulh_s16(vec1, scalar);
}

int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    int32x2_t scalar;
    scalar = vdup_n_s32(val2);
    return vqrdmulh_s32(vec1, scalar);
}

int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
_NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
{
    //solution may be not optimal
    int16x8_t scalar;
    scalar = vdupq_n_s16(val2);
    return vqrdmulhq_s16(vec1, scalar);
}

int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    int32x4_t scalar;
    scalar = vdupq_n_s32(val2);
    return vqrdmulhq_s32(vec1, scalar);
}

//********* Vector rounding saturating doubling multiply high by scalar ****
int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
_NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
{
    //solution may be not optimal
    int16_t vlane;
    int16x4_t scalar;
    vlane = vget_lane_s16(val2, val3);
    scalar = vdup_n_s16(vlane);
    return vqrdmulh_s16(vec1, scalar);
}

int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    int32_t vlane;
    int32x2_t scalar;
    vlane = vget_lane_s32(val2, val3);
    scalar = vdup_n_s32(vlane);
    return vqrdmulh_s32(vec1, scalar);
}

int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
_NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
{
    //solution may be not optimal
    int16_t vlane;
    int16x8_t scalar;
    vlane = vget_lane_s16(val2, val3);
    scalar = vdupq_n_s16(vlane);
    return vqrdmulhq_s16(vec1, scalar);
}

int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
{
    //solution may be not optimal
    int32_t vlane;
    int32x4_t scalar;
    vlane = vgetq_lane_s32(_pM128i(val2), val3);
    scalar = vdupq_n_s32(vlane);
    return vqrdmulhq_s32(vec1, scalar);
}
//**************Vector multiply accumulate with scalar *******************
int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
_NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
{
    int16x4_t scalar;
    scalar = vdup_n_s16(c);
    return vmla_s16(a, b, scalar);
}

int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
_NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
{
    int32x2_t scalar;
    scalar = vdup_n_s32(c);
    return vmla_s32(a, b, scalar);
}

uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
#define vmla_n_u16 vmla_n_s16

uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
#define vmla_n_u32 vmla_n_s32

float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
_NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
{
    float32x2_t scalar;
    scalar = vdup_n_f32(c);
    return vmla_f32(a, b, scalar);
}

int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
_NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
{
    int16x8_t scalar;
    scalar = vdupq_n_s16(c);
    return vmlaq_s16(a, b, scalar);
}

int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
{
    int32x4_t scalar;
    scalar = vdupq_n_s32(c);
    return vmlaq_s32(a, b, scalar);
}

uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
#define vmlaq_n_u16 vmlaq_n_s16

uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
#define vmlaq_n_u32 vmlaq_n_s32

float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
_NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
{
    float32x4_t scalar;
    scalar = vdupq_n_f32(c);
    return vmlaq_f32(a, b, scalar);
}
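//A minimal sketch (illustrative only; x4, y4 and alpha are hypothetical): one 4-float block of a
//SAXPY-style update y = y + alpha * x.
//    y4 = vmlaq_n_f32(y4, x4, alpha);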
//************Vector widening multiply accumulate with scalar****************************
int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
{
    int16x4_t vc;
    vc = vdup_n_s16(c);
    return vmlal_s16(a, b, vc);
}

int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
{
    int32x2_t vc;
    vc = vdup_n_s32(c);
    return vmlal_s32(a, b, vc);
}

uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.U16 q0, d0, d0[0]
{
    uint16x4_t vc;
    vc = vdup_n_u16(c); //use the unsigned widening multiply-accumulate so values above 0x7fff stay correct
    return vmlal_u16(a, b, vc);
}

uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
_NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
{
    uint32x2_t vc;
    vc = vdup_n_u32(c);
    return vmlal_u32(a, b, vc);
}

//************ Vector widening saturating doubling multiply accumulate with scalar **************
int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
{
    //not an optimal SIMD solution, serial may be faster
    int16x4_t vc;
    vc = vdup_n_s16(c);
    return vqdmlal_s16(a, b, vc);
}

int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int32x2_t vc;
    vc = vdup_n_s32(c);
    return vqdmlal_s32(a, b, vc);
}
//******** Vector multiply subtract with scalar **************
int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
_NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
{
    int16x4_t vc;
    vc = vdup_n_s16(c);
    return vmls_s16(a, b, vc);
}

int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
_NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
{
    int32x2_t vc;
    vc = vdup_n_s32(c);
    return vmls_s32(a, b, vc);
}

uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
_NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
{
    uint16x4_t vc;
    vc = vdup_n_s16(c);
    return vmls_s16(a, b, vc);
}

uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
_NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
{
    uint32x2_t vc;
    vc = vdup_n_u32(c);
    return vmls_u32(a, b, vc);
}

float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
_NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
{
    float32x2_t res;
    res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
    res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
    return res;
}

int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
{
    int16x8_t vc;
    vc = vdupq_n_s16(c);
    return vmlsq_s16(a, b, vc);
}

int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
{
    int32x4_t vc;
    vc = vdupq_n_s32(c);
    return vmlsq_s32(a, b, vc);
}

uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
{
    uint16x8_t vc;
    vc = vdupq_n_u16(c); //16-bit duplicate and subtract, not the 32-bit forms
    return vmlsq_u16(a, b, vc);
}

uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
{
    uint32x4_t vc;
    vc = vdupq_n_u32(c);
    return vmlsq_u32(a, b, vc);
}

float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
{
    float32x4_t vc;
    vc = vdupq_n_f32(c);
    return vmlsq_f32(a, b, vc);
}
//**** Vector widening multiply subtract with scalar ******
int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
{
    int16x4_t vc;
    vc = vdup_n_s16(c);
    return vmlsl_s16(a, b, vc);
}

int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
{
    int32x2_t vc;
    vc = vdup_n_s32(c);
    return vmlsl_s32(a, b, vc);
}

uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.U16 q0, d0, d0[0]
{
    uint16x4_t vc;
    vc = vdup_n_u16(c);
    return vmlsl_u16(a, b, vc);
}

uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
_NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
{
    uint32x2_t vc;
    vc = vdup_n_u32(c);
    return vmlsl_u32(a, b, vc);
}

//***** Vector widening saturating doubling multiply subtract with scalar *********
//**********************************************************************************
int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
_NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
{
    int16x4_t vc;
    vc = vdup_n_s16(c);
    return vqdmlsl_s16(a, b, vc);
}

int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int32x2_t vc;
    vc = vdup_n_s32(c);
    return vqdmlsl_s32(a, b, vc);
}
//******************* Vector extract ***********************************************
//*************************************************************************************
//VEXT (Vector Extract) extracts elements from the bottom end of the second operand
//vector and the top end of the first, concatenates them, and places the result in the destination vector:
//c elements from the bottom end of the second operand and (8-c) from the top end of the first
int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int8x8_t res;
    int i;
    for (i = 0; i < 8 - c; i++) {
        res.m64_i8[i] = a.m64_i8[i + c];
    }
    for (i = 0; i < c; i++) {
        res.m64_i8[8 - c + i] = b.m64_i8[i];
    }
    return res;
}

uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
#define vext_u8 vext_s8
//same result tested

poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
#define vext_p8 vext_u8

int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int16x4_t res;
    int i;
    for (i = 0; i < 4 - c; i++) {
        res.m64_i16[i] = a.m64_i16[i + c];
    }
    for (i = 0; i < c; i++) {
        res.m64_i16[4 - c + i] = b.m64_i16[i];
    }
    return res;
}

uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
#define vext_u16 vext_s16

poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
#define vext_p16 vext_s16

int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    int32x2_t res;
    if (c == 0) {
        res.m64_i32[0] = a.m64_i32[0];
        res.m64_i32[1] = a.m64_i32[1];
    } else {
        res.m64_i32[0] = a.m64_i32[1];
        res.m64_i32[1] = b.m64_i32[0];
    }
    return res;
}

float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
{
    float32x2_t res;
    if (c == 0) {
        res.m64_f32[0] = a.m64_f32[0];
        res.m64_f32[1] = a.m64_f32[1];
    } else {
        res.m64_f32[0] = a.m64_f32[1];
        res.m64_f32[1] = b.m64_f32[0];
    }
    return res;
}

uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
#define vext_u32 vext_s32
int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
#define vext_s64(a,b,c) a

uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
#define vext_u64(a,b,c) a

int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)

uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)

poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_p8 vextq_s8

int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)

uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)

poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_p16 vextq_s16

int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)

uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
14339 float32x4_t
vextq_f32(float32x4_t a
, float32x4_t b
, __constrange(0,3) float c
); // VEXT.32 q0,q0,q0,#0
14340 #define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
14342 int64x2_t
vextq_s64(int64x2_t a
, int64x2_t b
, __constrange(0,1) int c
); // VEXT.64 q0,q0,q0,#0
14343 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
14345 uint64x2_t
vextq_u64(uint64x2_t a
, uint64x2_t b
, __constrange(0,1) int c
); // VEXT.64 q0,q0,q0,#0
14346 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
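
//The q-form macros above rescale the NEON lane count c into the byte shift expected by palignr,
//e.g. vextq_s16(a,b,3) becomes _MM_ALIGNR_EPI8(b,a,6). A minimal usage sketch follows; the helper
//name is illustrative only and is not a NEON intrinsic.
_NEON2SSE_INLINE int16x8_t _neon2sse_example_sliding_window_s16(int16x8_t lo, int16x8_t hi)
{
    //takes lanes 3..7 of lo followed by lanes 0..2 of hi, as VEXT.16 with immediate #3 does
    return vextq_s16(lo, hi, 3);
}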
//************ Reverse vector elements (swap endianness)*****************
//*************************************************************************
//VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
_NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
{
    int8x8_t res64;
    __m128i res;
    res = vrev64q_s8(_pM128i(vec));
    return64(res);
}

int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
_NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
{
    int16x4_t res64;
    __m128i res;
    res = vrev64q_s16(_pM128i(vec));
    return64(res);
}

int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
_NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
{
    int32x2_t res;
    res.m64_i32[0] = vec.m64_i32[1];
    res.m64_i32[1] = vec.m64_i32[0];
    return res;
}

uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
#define vrev64_u8 vrev64_s8

uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
#define vrev64_u16 vrev64_s16

uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
#define vrev64_u32 vrev64_s32

poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
#define vrev64_p8 vrev64_u8

poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
#define vrev64_p16 vrev64_u16

float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
_NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
{
    float32x2_t res;
    res.m64_f32[0] = vec.m64_f32[1];
    res.m64_f32[1] = vec.m64_f32[0];
    return res;
}
int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
_NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
{
    _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9,8};
    return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
}

int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
_NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
{
    //no _mm_shuffle_epi16, so _mm_shuffle_epi8 is used with the corresponding mask
    _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5, 2,3, 0,1, 14,15, 12,13, 10,11, 8,9};
    return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
}

int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
_NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
{
    return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6));
}

uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
#define vrev64q_u8 vrev64q_s8

uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
#define vrev64q_u16 vrev64q_s16

uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
#define vrev64q_u32 vrev64q_s32

poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
#define vrev64q_p8 vrev64q_u8

poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
#define vrev64q_p16 vrev64q_u16

float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
#define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3,0,1))
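
//A minimal usage sketch (the helper below is illustrative, not a NEON intrinsic): VREV64.8 reverses
//the byte order inside each 64-bit half, which is exactly a per-lane 64-bit endianness swap.
_NEON2SSE_INLINE uint8x16_t _neon2sse_example_bswap64_lanes(uint8x16_t v)
{
    return vrev64q_u8(v); //bytes 7..0 of the low half, then bytes 15..8 of the high half
}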
//******************** 32 bit shuffles **********************
//************************************************************
int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
_NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
{
    int8x8_t res64;
    __m128i res;
    res = vrev32q_s8(_pM128i(vec));
    return64(res);
}

int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
_NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
{
    int16x4_t res64;
    __m128i res;
    res = vrev32q_s16(_pM128i(vec));
    return64(res);
}

uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
#define vrev32_u8 vrev32_s8

uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
#define vrev32_u16 vrev32_s16

poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
#define vrev32_p8 vrev32_u8

poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
#define vrev32_p16 vrev32_u16

int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
_NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
{
    _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
    return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
}

int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
_NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
{
    _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3, 0,1, 6,7, 4,5, 10,11, 8,9, 14,15, 12,13};
    return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8);
}

uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
#define vrev32q_u8 vrev32q_s8

uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
#define vrev32q_u16 vrev32q_s16

poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
#define vrev32q_p8 vrev32q_u8

poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
#define vrev32q_p16 vrev32q_u16
//************* 16 bit shuffles **********************
//******************************************************
int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
_NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
{
    int8x8_t res64;
    __m128i res;
    res = vrev16q_s8(_pM128i(vec));
    return64(res);
}

uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
#define vrev16_u8 vrev16_s8

poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
#define vrev16_p8 vrev16_u8

int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
_NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
{
    _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14};
    return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8);
}

uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
#define vrev16q_u8 vrev16q_s8

poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
#define vrev16q_p8 vrev16q_u8
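
//A minimal usage sketch (illustrative helper, not a NEON intrinsic): VREV32.8 swaps the bytes of each
//32-bit lane, so it converts four packed big-endian 32-bit values to little-endian in one shuffle.
_NEON2SSE_INLINE uint8x16_t _neon2sse_example_bswap32_lanes(uint8x16_t v)
{
    return vrev32q_u8(v); //lane bytes come out in the order 3,2,1,0, then 7,6,5,4, and so on
}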
//*********************************************************************
//**************** Other single operand arithmetic *******************
//*********************************************************************

//*********** Absolute: Vd[i] = |Va[i]| **********************************
//************************************************************************
int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a)
{
    int8x8_t res64;
    __m128i res;
    res = _mm_abs_epi8(_pM128i(a));
    return64(res);
}

int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a)
{
    int16x4_t res64;
    __m128i res;
    res = _mm_abs_epi16(_pM128i(a));
    return64(res);
}

int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a)
{
    int32x2_t res64;
    __m128i res;
    res = _mm_abs_epi32(_pM128i(a));
    return64(res);
}

float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
{
    float32x2_t res64;
    __m128 res;
    _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
    _M64f(res64, res);
    return res64;
}
int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
#define vabsq_s8 _mm_abs_epi8

int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
#define vabsq_s16 _mm_abs_epi16

int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
#define vabsq_s32 _mm_abs_epi32

float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
_NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
{
    _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    return _mm_and_ps (a, *(__m128*)c7fffffff);
}
//****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
//**********************************************************************
//For signed-integer data types the absolute value of the most negative value is not representable by the data type, so saturation takes place
int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
{
    int8x8_t res64;
    __m128i res;
    res = vqabsq_s8(_pM128i(a));
    return64(res);
}

int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
{
    int16x4_t res64;
    __m128i res;
    res = vqabsq_s16(_pM128i(a));
    return64(res);
}

int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
{
    int32x2_t res64;
    __m128i res;
    res = vqabsq_s32(_pM128i(a));
    return64(res);
}

int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
{
    __m128i c_128, abs, abs_cmp;
    c_128 = _mm_set1_epi8 (0x80); //-128
    abs = _mm_abs_epi8 (a);
    abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
    return _mm_xor_si128 (abs, abs_cmp);
}

int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
{
    __m128i c_32768, abs, abs_cmp;
    c_32768 = _mm_set1_epi16 (0x8000); //-32768
    abs = _mm_abs_epi16 (a);
    abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
    return _mm_xor_si128 (abs, abs_cmp);
}

int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
{
    __m128i c80000000, abs, abs_cmp;
    c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
    abs = _mm_abs_epi32 (a);
    abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
    return _mm_xor_si128 (abs, abs_cmp);
}
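
//A minimal check of the saturating behaviour (illustrative helper, not a NEON intrinsic):
//plain _mm_abs_epi8 maps -128 back to -128, while vqabsq_s8 clamps it to +127 as VQABS does.
_NEON2SSE_INLINE int _neon2sse_example_vqabs_saturates(void)
{
    int8x16_t most_negative = _mm_set1_epi8((int8_t)0x80); //all lanes = -128
    int8x16_t sat = vqabsq_s8(most_negative); //all lanes become +127
    return _mm_extract_epi16(sat, 0) & 0xff; //127 expected
}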
//*************** Negate: Vd[i] = - Va[i] *************************************
//*****************************************************************************
//several Negate implementations possible for SIMD,
//e.g. the _mm_sign_epi* functions (a, vector of negative numbers), but the following one gives good performance:
int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
{
    int8x8_t res64;
    __m128i res;
    res = vnegq_s8(_pM128i(a));
    return64(res);
}

int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
{
    int16x4_t res64;
    __m128i res;
    res = vnegq_s16(_pM128i(a));
    return64(res);
}

int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
{
    int32x2_t res64;
    __m128i res;
    res = vnegq_s32(_pM128i(a));
    return64(res);
}

float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
_NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
{
    float32x2_t res64;
    __m128 res;
    _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
    _M64f(res64, res);
    return res64;
}

int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
{
    __m128i zero;
    zero = _mm_setzero_si128 ();
    return _mm_sub_epi8 (zero, a);
} //or _mm_sign_epi8 (a, negative numbers vector)

int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
{
    __m128i zero;
    zero = _mm_setzero_si128 ();
    return _mm_sub_epi16 (zero, a);
} //or _mm_sign_epi16 (a, negative numbers vector)

int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
{
    __m128i zero;
    zero = _mm_setzero_si128 ();
    return _mm_sub_epi32 (zero, a);
} //or _mm_sign_epi32 (a, negative numbers vector)

float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
_NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
{
    _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
    return _mm_xor_ps (a, *(__m128*) c80000000);
}
//************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
//***************************************************************************************
//For signed-integer data types, the negation of the most negative value cannot be produced without saturation; with saturation the result is the maximum positive value
int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
{
    int8x8_t res64;
    __m128i res;
    res = vqnegq_s8(_pM128i(a));
    return64(res);
}

int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
{
    int16x4_t res64;
    __m128i res;
    res = vqnegq_s16(_pM128i(a));
    return64(res);
}

int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
{
    int32x2_t res64;
    __m128i res;
    res = vqnegq_s32(_pM128i(a));
    return64(res);
}

int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
{
    __m128i zero;
    zero = _mm_setzero_si128 ();
    return _mm_subs_epi8 (zero, a); //saturating subtraction
}

int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
{
    __m128i zero;
    zero = _mm_setzero_si128 ();
    return _mm_subs_epi16 (zero, a); //saturating subtraction
}

int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
{
    //this solution may be not optimal compared with a serial one
    __m128i c80000000, zero, sub, cmp;
    c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
    zero = _mm_setzero_si128 ();
    sub = _mm_sub_epi32 (zero, a); //subtraction
    cmp = _mm_cmpeq_epi32 (a, c80000000);
    return _mm_xor_si128 (sub, cmp);
}
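
//A minimal check of the saturating behaviour (illustrative helper, not a NEON intrinsic):
//a plain subtract wraps -INT32_MIN back to INT32_MIN, while vqnegq_s32 clamps it to INT32_MAX.
_NEON2SSE_INLINE int32x4_t _neon2sse_example_vqneg_clamps(void)
{
    int32x4_t most_negative = _mm_set1_epi32(0x80000000); //all lanes = INT32_MIN
    return vqnegq_s32(most_negative); //all lanes become 0x7fffffff = INT32_MAX
}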
//****************** Count leading zeros ********************************
//**************************************************************************
//no corresponding vector intrinsics in IA32, need to implement it. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
_NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
{
    int8x8_t res64;
    __m128i res;
    res = vclzq_s8(_pM128i(a));
    return64(res);
}

int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
_NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
{
    int16x4_t res64;
    __m128i res;
    res = vclzq_s16(_pM128i(a));
    return64(res);
}

int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
_NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
{
    int32x2_t res64;
    __m128i res;
    res = vclzq_s32(_pM128i(a));
    return64(res);
}

uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
#define vclz_u8 vclz_s8

uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
#define vclz_u16 vclz_s16

uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
#define vclz_u32 vclz_s32

int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
_NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
{
    _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
                                               /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
                                               /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
                                               /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
    __m128i maskLOW, c4, lowclz, mask, hiclz;
    maskLOW = _mm_set1_epi8(0x0f); //low 4 bits; the low-nibble lookup needs no masking - pshufb already returns zero when a byte's MSB is set
    c4 = _mm_set1_epi8(4);
    lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
    mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
    mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
    hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
    mask = _mm_cmpeq_epi8(hiclz, c4); //shows the need to add lowclz zeros
    lowclz = _mm_and_si128(lowclz, mask);
    return _mm_add_epi8(lowclz, hiclz);
}
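
//Scalar reference of the nibble-LUT scheme above (a sketch for clarity only, not used by the mapping):
//the final add works because the low-nibble count is kept only when the high nibble is all zeros.
_NEON2SSE_INLINE int _neon2sse_example_clz8_scalar(uint8_t x)
{
    const int8_t clz4[16] = {4,3,2,2, 1,1,1,1, 0,0,0,0, 0,0,0,0};
    int hi = clz4[x >> 4];
    int lo = clz4[x & 0x0f];
    return (hi == 4) ? hi + lo : hi; //0..8 leading zero bits
}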
int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
_NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
{
    __m128i c7, res8x16, res8x16_swap;
    _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
    _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff};
    c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
    res8x16 = vclzq_s8(a);
    res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
    res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
    res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
    c7 = _mm_cmpgt_epi16(res8x16_swap, c7); //shows the need to add lowclz zeros
    res8x16 = _mm_and_si128(res8x16, c7); //lowclz
    return _mm_add_epi16(res8x16_swap, res8x16);
}
int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
_NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
{
    __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
    c55555555 = _mm_set1_epi32(0x55555555);
    c33333333 = _mm_set1_epi32(0x33333333);
    c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
    c3f = _mm_set1_epi32(0x3f);
    c32 = _mm_set1_epi32(32);
    tmp = _mm_srli_epi32(a, 1);
    res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
    tmp = _mm_srli_epi32(res, 2);
    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
    tmp = _mm_srli_epi32(res, 4);
    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
    tmp = _mm_srli_epi32(res, 8);
    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
    tmp = _mm_srli_epi32(res, 16);
    res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);

    tmp = _mm_srli_epi32(res, 1);
    tmp = _mm_and_si128(tmp, c55555555);
    res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);

    tmp = _mm_srli_epi32(res, 2);
    tmp = _mm_and_si128(tmp, c33333333);
    tmp1 = _mm_and_si128(res, c33333333);
    res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));

    tmp = _mm_srli_epi32(res, 4);
    tmp = _mm_add_epi32(tmp, res);
    res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);

    tmp = _mm_srli_epi32(res, 8);
    res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);

    tmp = _mm_srli_epi32(res, 16);
    res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);

    res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;

    return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
}

uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
#define vclzq_u8 vclzq_s8

uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
#define vclzq_u16 vclzq_s16

uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
#define vclzq_u32 vclzq_s32
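
//A common use of VCLZ (illustrative helper, not a NEON intrinsic): floor(log2(x)) for non-zero
//unsigned lanes is 31 - clz(x), here computed for four 32-bit lanes at once.
_NEON2SSE_INLINE uint32x4_t _neon2sse_example_ilog2_u32(uint32x4_t x)
{
    uint32x4_t clz = vclzq_u32(x);
    return _mm_sub_epi32(_mm_set1_epi32(31), clz); //meaningless for zero lanes, where clz(0)=32 gives -1
}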
//************** Count leading sign bits **************************
//********************************************************************
//VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
//the topmost bit that are the same as the topmost bit, in each element of a vector.
//No corresponding vector intrinsics in IA32, need to implement it.
//While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
_NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
{
    int8x8_t res64;
    __m128i res;
    res = vclsq_s8(_pM128i(a));
    return64(res);
}

int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
_NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
{
    int16x4_t res64;
    __m128i res;
    res = vclsq_s16(_pM128i(a));
    return64(res);
}

int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
_NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
{
    int32x2_t res64;
    __m128i res;
    res = vclsq_s32(_pM128i(a));
    return64(res);
}

int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
_NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
{
    __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
    cff = _mm_cmpeq_epi8 (a, a); //0xff
    c80 = _mm_set1_epi8(0x80);
    c1 = _mm_set1_epi8(1);
    a_mask = _mm_and_si128(a, c80);
    a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
    a_neg = _mm_xor_si128(a, cff);
    a_neg = _mm_and_si128(a_mask, a_neg);
    a_pos = _mm_andnot_si128(a_mask, a);
    a_comb = _mm_or_si128(a_pos, a_neg);
    a_comb = vclzq_s8(a_comb);
    return _mm_sub_epi8(a_comb, c1);
}

int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
_NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
{
    __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
    cffff = _mm_cmpeq_epi16(a, a);
    c8000 = _mm_slli_epi16(cffff, 15); //0x8000
    c1 = _mm_srli_epi16(cffff, 15); //0x1
    a_mask = _mm_and_si128(a, c8000);
    a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
    a_neg = _mm_xor_si128(a, cffff);
    a_neg = _mm_and_si128(a_mask, a_neg);
    a_pos = _mm_andnot_si128(a_mask, a);
    a_comb = _mm_or_si128(a_pos, a_neg);
    a_comb = vclzq_s16(a_comb);
    return _mm_sub_epi16(a_comb, c1);
}

int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
_NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
{
    __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
    cffffffff = _mm_cmpeq_epi32(a, a);
    c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000
    c1 = _mm_srli_epi32(cffffffff, 31); //0x1
    a_mask = _mm_and_si128(a, c80000000);
    a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
    a_neg = _mm_xor_si128(a, cffffffff);
    a_neg = _mm_and_si128(a_mask, a_neg);
    a_pos = _mm_andnot_si128(a_mask, a);
    a_comb = _mm_or_si128(a_pos, a_neg);
    a_comb = vclzq_s32(a_comb);
    return _mm_sub_epi32(a_comb, c1);
}
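
//The implementations above reduce VCLS to VCLZ: a negative lane is bitwise inverted first, so its
//leading sign bits become leading zeros, and one is subtracted because the sign bit itself is not counted.
//A scalar sketch of that identity (for clarity only, not used by the mapping):
_NEON2SSE_INLINE int _neon2sse_example_cls32_scalar(int32_t x)
{
    uint32_t v = (x < 0) ? ~(uint32_t)x : (uint32_t)x;
    int clz = 0;
    while (clz < 32 && !(v & (0x80000000u >> clz))) clz++; //count leading zeros of v
    return clz - 1; //VCLS result, e.g. 31 for x == 0 or x == -1
}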
//************************* Count number of set bits ********************************
//*************************************************************************************
//No direct SIMD solution. One option is to store the elements, widen each to 32 bits and use the SSE4.2 scalar _mm_popcnt_u32 (unsigned int v) on each element;
//another option is the following algorithm:

uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
_NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
{
    uint8x8_t res64;
    __m128i res;
    res = vcntq_u8(_pM128i(a));
    return64(res);
}

int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
#define vcnt_s8 vcnt_u8

poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
#define vcnt_p8 vcnt_u8

uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
_NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
{
    _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
                                                    /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
                                                    /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
                                                    /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4 };
    __m128i maskLOW, mask, lowpopcnt, hipopcnt;
    maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
    mask = _mm_and_si128(a, maskLOW);
    lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
    mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
    mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
    hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
    return _mm_add_epi8(lowpopcnt, hipopcnt);
}

int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
#define vcntq_s8 vcntq_u8

poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
#define vcntq_p8 vcntq_u8
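
//A typical use of VCNT (illustrative helper, not a NEON intrinsic): the per-byte popcount is the
//building block for a full Hamming weight; summing the 16 byte counts with _mm_sad_epu8 gives the total.
_NEON2SSE_INLINE uint32_t _neon2sse_example_popcount128(uint8x16_t v)
{
    __m128i bytecnt = vcntq_u8(v); //per-byte set-bit counts, each 0..8
    __m128i sums = _mm_sad_epu8(bytecnt, _mm_setzero_si128()); //two 64-bit partial sums
    return (uint32_t)(_mm_cvtsi128_si32(sums) + _mm_cvtsi128_si32(_mm_srli_si128(sums, 8)));
}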
//**************************************************************************************
//*********************** Logical operations ****************************************
//**************************************************************************************
//************************** Bitwise not ***********************************
//several Bitwise not implementations possible for SIMD, e.g. "xor" with all ones, but the following one gives good performance
int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
_NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
{
    int8x8_t res64;
    __m128i res;
    res = vmvnq_s8(_pM128i(a));
    return64(res);
}

int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
_NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
{
    int16x4_t res64;
    __m128i res;
    res = vmvnq_s16(_pM128i(a));
    return64(res);
}

int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
_NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
{
    int32x2_t res64;
    __m128i res;
    res = vmvnq_s32(_pM128i(a));
    return64(res);
}

uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
#define vmvn_u8 vmvn_s8

uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
#define vmvn_u16 vmvn_s16

uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
#define vmvn_u32 vmvn_s32

poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
#define vmvn_p8 vmvn_u8

int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
_NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
{
    __m128i c1;
    c1 = _mm_cmpeq_epi8 (a, a); //0xff
    return _mm_andnot_si128 (a, c1);
}

int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
_NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
{
    __m128i c1;
    c1 = _mm_cmpeq_epi16 (a, a); //0xffff
    return _mm_andnot_si128 (a, c1);
}

int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
_NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
{
    __m128i c1;
    c1 = _mm_cmpeq_epi32 (a, a); //0xffffffff
    return _mm_andnot_si128 (a, c1);
}

uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
#define vmvnq_u8 vmvnq_s8

uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
#define vmvnq_u16 vmvnq_s16

uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
#define vmvnq_u32 vmvnq_s32

poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
#define vmvnq_p8 vmvnq_u8
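
//The q-form bitwise not above builds the all-ones constant with a compare of a register against
//itself and then uses andnot, avoiding a memory load. An equivalent xor formulation (sketch only,
//the helper is illustrative and not a NEON intrinsic):
_NEON2SSE_INLINE uint32x4_t _neon2sse_example_not_via_xor(uint32x4_t a)
{
    return _mm_xor_si128(a, _mm_cmpeq_epi32(a, a)); //same result as vmvnq_u32(a)
}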
//****************** Bitwise and ***********************
//******************************************************
int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
_NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(_mm_and_si128(_pM128i(a), _pM128i(b)));
}

int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
_NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(_mm_and_si128(_pM128i(a), _pM128i(b)));
}

int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
_NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(_mm_and_si128(_pM128i(a), _pM128i(b)));
}

int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
_NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b)
{
    int64x1_t res;
    res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
    return res;
}

uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
#define vand_u8 vand_s8

uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
#define vand_u16 vand_s16

uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
#define vand_u32 vand_s32

uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
#define vand_u64 vand_s64

int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
#define vandq_s8 _mm_and_si128

int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
#define vandq_s16 _mm_and_si128

int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
#define vandq_s32 _mm_and_si128

int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
#define vandq_s64 _mm_and_si128

uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
#define vandq_u8 _mm_and_si128

uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
#define vandq_u16 _mm_and_si128

uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
#define vandq_u32 _mm_and_si128

uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
#define vandq_u64 _mm_and_si128
//******************** Bitwise or *********************************
//******************************************************************
int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
_NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(_mm_or_si128(_pM128i(a), _pM128i(b)));
}

int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
_NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(_mm_or_si128(_pM128i(a), _pM128i(b)));
}

int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
_NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(_mm_or_si128(_pM128i(a), _pM128i(b)));
}

int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
_NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b)
{
    int64x1_t res;
    res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
    return res;
}

uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
#define vorr_u8 vorr_s8

uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
#define vorr_u16 vorr_s16

uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
#define vorr_u32 vorr_s32

uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
#define vorr_u64 vorr_s64

int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
#define vorrq_s8 _mm_or_si128

int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
#define vorrq_s16 _mm_or_si128

int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
#define vorrq_s32 _mm_or_si128

int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
#define vorrq_s64 _mm_or_si128

uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
#define vorrq_u8 _mm_or_si128

uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
#define vorrq_u16 _mm_or_si128

uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
#define vorrq_u32 _mm_or_si128

uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
#define vorrq_u64 _mm_or_si128
//************* Bitwise exclusive or (EOR or XOR) ******************
//*******************************************************************
int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
_NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(_mm_xor_si128(_pM128i(a), _pM128i(b)));
}

int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
#define veor_s16 veor_s8

int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
#define veor_s32 veor_s8

int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
_NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b)
{
    int64x1_t res;
    res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
    return res;
}

uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
#define veor_u8 veor_s8

uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
#define veor_u16 veor_s16

uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
#define veor_u32 veor_s32

uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
#define veor_u64 veor_s64

int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
#define veorq_s8 _mm_xor_si128

int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
#define veorq_s16 _mm_xor_si128

int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
#define veorq_s32 _mm_xor_si128

int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
#define veorq_s64 _mm_xor_si128

uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
#define veorq_u8 _mm_xor_si128

uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
#define veorq_u16 _mm_xor_si128

uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
#define veorq_u32 _mm_xor_si128

uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
#define veorq_u64 _mm_xor_si128
//********************** Bit Clear **********************************
//*******************************************************************
//Logical AND complement (AND negation or AND NOT)
int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
_NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(_mm_andnot_si128(_pM128i(b), _pM128i(a))); //notice the arguments "swap"
}

int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
#define vbic_s16 vbic_s8

int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
#define vbic_s32 vbic_s8

int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
_NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b)
{
    int64x1_t res;
    res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
    return res;
}

uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
#define vbic_u8 vbic_s8

uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
#define vbic_u16 vbic_s16

uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
#define vbic_u32 vbic_s32

uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
#define vbic_u64 vbic_s64

int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
#define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"

int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
#define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"

int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
#define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"

int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
#define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"

uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
#define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"

uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
#define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"

uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
#define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"

uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
#define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
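
//VBIC computes a & ~b while _mm_andnot_si128(x,y) computes (~x) & y, hence the swapped arguments in
//the macros above. A quick sanity sketch (illustrative helper, not a NEON intrinsic):
_NEON2SSE_INLINE uint32x4_t _neon2sse_example_clear_flags(uint32x4_t flags, uint32x4_t to_clear)
{
    return vbicq_u32(flags, to_clear); //expands to _mm_andnot_si128(to_clear, flags)
}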
//**************** Bitwise OR complement ********************************
//************************************************************************
//no exact IA32 match, it has to be implemented as follows
int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
_NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    return64(vornq_s8(_pM128i(a), _pM128i(b)));
}

int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
_NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b)
{
    int16x4_t res64;
    return64(vornq_s16(_pM128i(a), _pM128i(b)));
}

int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
_NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b)
{
    int32x2_t res64;
    return64(vornq_s32(_pM128i(a), _pM128i(b)));
}

int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
_NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
{
    int64x1_t res;
    res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
    return res;
}

uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
#define vorn_u8 vorn_s8

uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
#define vorn_u16 vorn_s16

uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
#define vorn_u32 vorn_s32

uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
#define vorn_u64 vorn_s64

int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
{
    __m128i b1;
    b1 = vmvnq_s8( b); //bitwise not for b
    return _mm_or_si128 (a, b1);
}

int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
{
    __m128i b1;
    b1 = vmvnq_s16( b); //bitwise not for b
    return _mm_or_si128 (a, b1);
}

int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
{
    __m128i b1;
    b1 = vmvnq_s32( b); //bitwise not for b
    return _mm_or_si128 (a, b1);
}

int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
{
    __m128i c1, b1;
    c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
    b1 = _mm_andnot_si128 (b, c1);
    return _mm_or_si128 (a, b1);
}

uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
{
    __m128i b1;
    b1 = vmvnq_u8( b); //bitwise not for b
    return _mm_or_si128 (a, b1);
}

uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
{
    __m128i b1;
    b1 = vmvnq_s16( b); //bitwise not for b
    return _mm_or_si128 (a, b1);
}

uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
{
    __m128i b1;
    b1 = vmvnq_u32( b); //bitwise not for b
    return _mm_or_si128 (a, b1);
}

uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
#define vornq_u64 vornq_s64
//********************* Bitwise Select *****************************
//******************************************************************
//Note: on ARM this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.

//VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
//corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.

//VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
//if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged.

//VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
//if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.

//Only the VBSL form is implemented for SIMD here
int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
_NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
{
    int8x8_t res64;
    __m128i res;
    res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
    return64(res);
}

int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
#define vbsl_s16 vbsl_s8

int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
#define vbsl_s32 vbsl_s8

int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
_NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
{
    int64x1_t res;
    res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
    return res;
}

uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
#define vbsl_u8 vbsl_s8

uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
#define vbsl_u16 vbsl_s8

uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
#define vbsl_u32 vbsl_s8

uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
#define vbsl_u64 vbsl_s64

float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
_NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
{
    __m128 sel1, sel2;
    float32x2_t res64;
    sel1 = _mm_and_ps (_pM128(a), _pM128(b));
    sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
    sel1 = _mm_or_ps (sel1, sel2);
    _M64f(res64, sel1);
    return res64;
}

poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
#define vbsl_p8 vbsl_s8

poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
#define vbsl_p16 vbsl_s8

int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
_NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
{
    __m128i sel1, sel2;
    sel1 = _mm_and_si128 (a, b);
    sel2 = _mm_andnot_si128 (a, c);
    return _mm_or_si128 (sel1, sel2);
}

int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
#define vbslq_s16 vbslq_s8

int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
#define vbslq_s32 vbslq_s8

int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
#define vbslq_s64 vbslq_s8

uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
#define vbslq_u8 vbslq_s8

uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
#define vbslq_u16 vbslq_s8

uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
#define vbslq_u32 vbslq_s8

uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
#define vbslq_u64 vbslq_s8

float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
_NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
{
    __m128 sel1, sel2;
    sel1 = _mm_and_ps (*(__m128*)&a, b);
    sel2 = _mm_andnot_ps (*(__m128*)&a, c);
    return _mm_or_ps (sel1, sel2);
}

poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
#define vbslq_p8 vbslq_u8

poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
#define vbslq_p16 vbslq_s8
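
//A typical use of VBSL (illustrative helper, not a NEON intrinsic): a branchless per-lane select,
//here picking the larger of two unsigned byte vectors from a comparison mask.
_NEON2SSE_INLINE uint8x16_t _neon2sse_example_select_max_u8(uint8x16_t x, uint8x16_t y)
{
    uint8x16_t mask = _mm_cmpeq_epi8(_mm_max_epu8(x, y), x); //0xff where x >= y
    return vbslq_u8(mask, x, y); //mask ? x : y, bit by bit
}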
//************************************************************************************
//**************** Transposition operations ****************************************
//************************************************************************************
//***************** Vector Transpose ************************************************
//************************************************************************************
//VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices and transposes the matrices,
//making the result look like (a0, b0, a2, b2, a4, b4, ...), (a1, b1, a3, b3, a5, b5, ...)
int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
_NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
{
    int8x8x2_t val;
    __m128i tmp, val0;
    _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp
    tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
    vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6)
    return val;
}

int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
_NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
{
    int16x4x2_t val;
    __m128i tmp, val0;
    _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14,15};
    tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
    val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
    vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2)
    return val;
}

int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
_NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
{
    int32x2x2_t val;
    __m128i val0;
    val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
    vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0
    return val;
}

uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
#define vtrn_u8 vtrn_s8

uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
#define vtrn_u16 vtrn_s16

uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
#define vtrn_u32 vtrn_s32

float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
_NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
{
    float32x2x2_t val;
    val.val[0].m64_f32[0] = a.m64_f32[0];
    val.val[0].m64_f32[1] = b.m64_f32[0];
    val.val[1].m64_f32[0] = a.m64_f32[1];
    val.val[1].m64_f32[1] = b.m64_f32[1];
    return val; //a0,b0,a1,b1
}

poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
#define vtrn_p8 vtrn_u8

poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
#define vtrn_p16 vtrn_s16
//int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
_NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
{
    int8x16x2_t r8x16;
    __m128i a_sh, b_sh;
    _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15

    r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
    r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); //(a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
    return r8x16;
}

int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
_NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
{
    int16x8x2_t v16x8;
    __m128i a_sh, b_sh;
    _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
    a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7
    b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7
    v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
    v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
    return v16x8;
}

int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
_NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
{
    //may be not optimal solution compared with serial
    int32x4x2_t v32x4;
    __m128i a_sh, b_sh;
    a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
    b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3

    v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
    v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3
    return v32x4;
}

uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
#define vtrnq_u8 vtrnq_s8

uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
#define vtrnq_u16 vtrnq_s16

uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
#define vtrnq_u32 vtrnq_s32

float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
_NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
{
    //may be not optimal solution compared with serial
    float32x4x2_t f32x4;
    __m128 a_sh, b_sh;
    a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1,2,0)); //a0, a2, a1, a3, need to check endianness
    b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1,2,0)); //b0, b2, b1, b3, need to check endianness

    f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
    f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3
    return f32x4;
}

poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
#define vtrnq_p8 vtrnq_s8

poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
#define vtrnq_p16 vtrnq_s16
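
//A minimal usage sketch (illustrative helper, not a NEON intrinsic): VTRN is the usual building block
//for 2x2 block transposes, e.g. exchanging the odd-indexed elements of two rows in a single step.
_NEON2SSE_INLINE int16x8x2_t _neon2sse_example_transpose_pairs_s16(int16x8_t row0, int16x8_t row1)
{
    //val[0] = (r0[0], r1[0], r0[2], r1[2], ...), val[1] = (r0[1], r1[1], r0[3], r1[3], ...)
    return vtrnq_s16(row0, row1);
}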
//***************** Interleave elements ***************************
//*****************************************************************
//output has (a0,b0,a1,b1, a2,b2, ...)
int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
_NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
{
    int8x8x2_t val;
    __m128i val0;
    val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
    vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
    return val;
}

int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
_NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
{
    int16x4x2_t val;
    __m128i val0;
    val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
    vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
    return val;
}

int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
#define vzip_s32 vtrn_s32

uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
#define vzip_u8 vzip_s8

uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
#define vzip_u16 vzip_s16

uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
#define vzip_u32 vzip_s32

float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
#define vzip_f32 vtrn_f32

poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
#define vzip_p8 vzip_u8

poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
#define vzip_p16 vzip_u16

int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
_NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
{
    int8x16x2_t r8x16;
    r8x16.val[0] = _mm_unpacklo_epi8(a, b);
    r8x16.val[1] = _mm_unpackhi_epi8(a, b);
    return r8x16;
}

int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
_NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
{
    int16x8x2_t r16x8;
    r16x8.val[0] = _mm_unpacklo_epi16(a, b);
    r16x8.val[1] = _mm_unpackhi_epi16(a, b);
    return r16x8;
}

int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
_NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
{
    int32x4x2_t r32x4;
    r32x4.val[0] = _mm_unpacklo_epi32(a, b);
    r32x4.val[1] = _mm_unpackhi_epi32(a, b);
    return r32x4;
}

uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
#define vzipq_u8 vzipq_s8

uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
#define vzipq_u16 vzipq_s16

uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
#define vzipq_u32 vzipq_s32

float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
_NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
{
    float32x4x2_t f32x4;
    f32x4.val[0] = _mm_unpacklo_ps ( a, b);
    f32x4.val[1] = _mm_unpackhi_ps ( a, b);
    return f32x4;
}

poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
#define vzipq_p8 vzipq_u8

poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
#define vzipq_p16 vzipq_u16
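
//A minimal usage sketch (illustrative helper, not a NEON intrinsic): interleaving two planar channels
//into one packed stream, e.g. left/right audio samples into an L,R,L,R,... layout.
_NEON2SSE_INLINE int16x8x2_t _neon2sse_example_interleave_stereo_s16(int16x8_t left, int16x8_t right)
{
    //val[0] holds samples 0..3 interleaved, val[1] holds samples 4..7 interleaved
    return vzipq_s16(left, right);
}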
15826 //*********************** De-Interleave elements *************************
15827 //*************************************************************************
15828 //As the result of these functions first val contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...)
15829 //no such functions in IA32 SIMD, shuffle is required
15830 int8x8x2_t
vuzp_s8(int8x8_t a
, int8x8_t b
); // VUZP.8 d0,d0
15831 _NEON2SSE_INLINE int8x8x2_t
vuzp_s8(int8x8_t a
, int8x8_t b
) // VUZP.8 d0,d0
15835 _NEON2SSE_ALIGN_16
int8_t maskdlv8
[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15};
15836 tmp
= _mm_unpacklo_epi8(_pM128i(a
), _pM128i(b
)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
15837 val0
= _mm_shuffle_epi8 (tmp
, *(__m128i
*)maskdlv8
); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7)
15838 vst1q_s8(val
.val
, val0
); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15842 int16x4x2_t
vuzp_s16(int16x4_t a
, int16x4_t b
); // VUZP.16 d0,d0
15843 _NEON2SSE_INLINE int16x4x2_t
vuzp_s16(int16x4_t a
, int16x4_t b
) // VUZP.16 d0,d0
15847 _NEON2SSE_ALIGN_16
int8_t maskdlv16
[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};
15848 tmp
= _mm_unpacklo_epi16(_pM128i(a
), _pM128i(b
)); //a0,b0,a1,b1,a2,b2,a3,b3
15849 val0
= _mm_shuffle_epi8 (tmp
, *(__m128i
*)maskdlv16
); //a0,a2, b0, b2, a1,a3, b1,b3
15850 vst1q_s16(val
.val
, val0
); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15854 int32x2x2_t
vuzp_s32(int32x2_t a
, int32x2_t b
); // VUZP.32 d0,d0
15855 _NEON2SSE_INLINE int32x2x2_t
vuzp_s32(int32x2_t a
, int32x2_t b
) // VUZP.32 d0,d0
15859 val0
= _mm_unpacklo_epi32(_pM128i(a
), _pM128i(b
)); //a0,b0, a1,b1
15860 vst1q_s32(val
.val
, val0
); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
15864 uint8x8x2_t
vuzp_u8(uint8x8_t a
, uint8x8_t b
); // VUZP.8 d0,d0
15865 #define vuzp_u8 vuzp_s8
15867 uint16x4x2_t
vuzp_u16(uint16x4_t a
, uint16x4_t b
); // VUZP.16 d0,d0
15868 #define vuzp_u16 vuzp_s16
15870 uint32x2x2_t
vuzp_u32(uint32x2_t a
, uint32x2_t b
); // VUZP.32 d0,d0
15871 #define vuzp_u32 vuzp_s32
15873 float32x2x2_t
vuzp_f32(float32x2_t a
, float32x2_t b
); // VUZP.32 d0,d0
15874 #define vuzp_f32 vzip_f32
15876 poly8x8x2_t
vuzp_p8(poly8x8_t a
, poly8x8_t b
); // VUZP.8 d0,d0
15877 #define vuzp_p8 vuzp_u8
15879 poly16x4x2_t
vuzp_p16(poly16x4_t a
, poly16x4_t b
); // VUZP.16 d0,d0
15880 #define vuzp_p16 vuzp_u16
15882 int8x16x2_t
vuzpq_s8(int8x16_t a
, int8x16_t b
); // VUZP.8 q0,q0
15883 _NEON2SSE_INLINE int8x16x2_t
vuzpq_s8(int8x16_t a
, int8x16_t b
) // VUZP.8 q0,q0
15886 __m128i a_sh
, b_sh
;
15887 _NEON2SSE_ALIGN_16
int8_t mask8_even_odd
[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
15888 a_sh
= _mm_shuffle_epi8 (a
, *(__m128i
*)mask8_even_odd
); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
15889 b_sh
= _mm_shuffle_epi8 (b
, *(__m128i
*)mask8_even_odd
); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
15890 //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
15891 v8x16
.val
[0] = _mm_unpacklo_epi64(a_sh
, b_sh
); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14,
15892 v8x16
.val
[1] = _mm_unpackhi_epi64(a_sh
, b_sh
); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15
15896 int16x8x2_t
vuzpq_s16(int16x8_t a
, int16x8_t b
); // VUZP.16 q0,q0
15897 _NEON2SSE_INLINE int16x8x2_t
vuzpq_s16(int16x8_t a
, int16x8_t b
) // VUZP.16 q0,q0
15900 __m128i a_sh
, b_sh
;
15901 _NEON2SSE_ALIGN_16
int8_t mask16_even_odd
[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
15902 a_sh
= _mm_shuffle_epi8 (a
, *(__m128i
*)mask16_even_odd
); //a0, a2, a4, a6, a1, a3, a5, a7
15903 b_sh
= _mm_shuffle_epi8 (b
, *(__m128i
*)mask16_even_odd
); //b0, b2, b4, b6, b1, b3, b5, b7
15904 v16x8
.val
[0] = _mm_unpacklo_epi64(a_sh
, b_sh
); //a0, a2, a4, a6, b0, b2, b4, b6
15905 v16x8
.val
[1] = _mm_unpackhi_epi64(a_sh
, b_sh
); //a1, a3, a5, a7, b1, b3, b5, b7
15909 int32x4x2_t
vuzpq_s32(int32x4_t a
, int32x4_t b
); // VUZP.32 q0,q0
15910 _NEON2SSE_INLINE int32x4x2_t
vuzpq_s32(int32x4_t a
, int32x4_t b
) // VUZP.32 q0,q0
15912 //may be not optimal solution compared with serial
15914 __m128i a_sh
, b_sh
;
15915 a_sh
= _mm_shuffle_epi32 (a
, 216); //a0, a2, a1, a3
15916 b_sh
= _mm_shuffle_epi32 (b
, 216); //b0, b2, b1, b3
15918 v32x4
.val
[0] = _mm_unpacklo_epi64(a_sh
, b_sh
); //a0, a2, b0, b2
15919 v32x4
.val
[1] = _mm_unpackhi_epi64(a_sh
, b_sh
); //a1, a3, b1, b3
15923 uint8x16x2_t
vuzpq_u8(uint8x16_t a
, uint8x16_t b
); // VUZP.8 q0,q0
15924 #define vuzpq_u8 vuzpq_s8
15926 uint16x8x2_t
vuzpq_u16(uint16x8_t a
, uint16x8_t b
); // VUZP.16 q0,q0
15927 #define vuzpq_u16 vuzpq_s16
15929 uint32x4x2_t
vuzpq_u32(uint32x4_t a
, uint32x4_t b
); // VUZP.32 q0,q0
15930 #define vuzpq_u32 vuzpq_s32
float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
_NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
{
    float32x4x2_t v32x4;
    v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however
    v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however
    return v32x4;
}
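//Illustrative sketch (hypothetical helper, not part of the NEON API): using
//vuzpq_f32 to split interleaved {re,im} pairs into separate re / im vectors.
static void neon2sse_example_vuzpq_f32(void)
{
    _NEON2SSE_ALIGN_16 float in0[4] = {1.0f, 10.0f, 2.0f, 20.0f}; // re0,im0, re1,im1
    _NEON2SSE_ALIGN_16 float in1[4] = {3.0f, 30.0f, 4.0f, 40.0f}; // re2,im2, re3,im3
    _NEON2SSE_ALIGN_16 float re[4], im[4];
    float32x4_t v0 = vld1q_f32(in0);
    float32x4_t v1 = vld1q_f32(in1);
    float32x4x2_t r = vuzpq_f32(v0, v1);
    vst1q_f32(re, r.val[0]); // 1, 2, 3, 4       (even lanes = real parts)
    vst1q_f32(im, r.val[1]); // 10, 20, 30, 40   (odd lanes  = imaginary parts)
}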
poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
#define vuzpq_p8 vuzpq_u8

poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
#define vuzpq_p16 vuzpq_u16
//##############################################################################################
//*********************** Reinterpret cast intrinsics *****************************************
//##############################################################################################
// Not a part of the official NEON instruction set but available in the gcc compiler ***********
poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
#define vreinterpret_p8_u32
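//Note on the empty object-like #defines in this section: when the source and
//destination NEON types map to the same container type in this emulation, the
//macro expands to nothing, so a call such as
//    poly8x8_t p = vreinterpret_p8_u32(v);
//collapses to  poly8x8_t p = (v);  - a pure re-labelling of the same bits.
//Only the 128-bit float <-> integer cases below need an explicit cast
//(e.g. _M128i), because __m128 and __m128i are distinct SSE types.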
poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
#define vreinterpret_p8_u16
poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
#define vreinterpret_p8_u8
poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
#define vreinterpret_p8_s32
poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
#define vreinterpret_p8_s16
poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
#define vreinterpret_p8_s8
poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
#define vreinterpret_p8_u64
poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
#define vreinterpret_p8_s64
poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
#define vreinterpret_p8_f32
poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
#define vreinterpret_p8_p16

poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
#define vreinterpretq_p8_u32
poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
#define vreinterpretq_p8_u16
poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
#define vreinterpretq_p8_u8
poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
#define vreinterpretq_p8_s32
poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
#define vreinterpretq_p8_s16
poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
#define vreinterpretq_p8_s8
poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
#define vreinterpretq_p8_u64
poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
#define vreinterpretq_p8_s64
poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
#define vreinterpretq_p8_f32(t) _M128i(t)
poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
#define vreinterpretq_p8_p16
poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
#define vreinterpret_p16_u32
poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
#define vreinterpret_p16_u16
poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
#define vreinterpret_p16_u8
poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
#define vreinterpret_p16_s32
poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
#define vreinterpret_p16_s16
poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
#define vreinterpret_p16_s8
poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
#define vreinterpret_p16_u64
poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
#define vreinterpret_p16_s64
poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
#define vreinterpret_p16_f32
poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
#define vreinterpret_p16_p8

poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
#define vreinterpretq_p16_u32
poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
#define vreinterpretq_p16_u16
poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
#define vreinterpretq_p16_s32
poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
#define vreinterpretq_p16_s16
poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
#define vreinterpretq_p16_s8
poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
#define vreinterpretq_p16_u64
poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
#define vreinterpretq_p16_s64
poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
#define vreinterpretq_p16_f32(t) _M128i(t)
poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
#define vreinterpretq_p16_p8 vreinterpretq_s16_p8
//**** Integer to float ******
float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
#define vreinterpret_f32_u32(t) (*(__m64_128*)&(t))
float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
#define vreinterpret_f32_u16 vreinterpret_f32_u32
float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
#define vreinterpret_f32_u8 vreinterpret_f32_u32
float32x2_t vreinterpret_f32_s32 (int32x2_t t);
#define vreinterpret_f32_s32 vreinterpret_f32_u32
float32x2_t vreinterpret_f32_s16 (int16x4_t t);
#define vreinterpret_f32_s16 vreinterpret_f32_u32
float32x2_t vreinterpret_f32_s8 (int8x8_t t);
#define vreinterpret_f32_s8 vreinterpret_f32_u32
float32x2_t vreinterpret_f32_u64 (uint64x1_t t);
#define vreinterpret_f32_u64 vreinterpret_f32_u32
float32x2_t vreinterpret_f32_s64 (int64x1_t t);
#define vreinterpret_f32_s64 vreinterpret_f32_u32
float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
#define vreinterpret_f32_p16 vreinterpret_f32_u32
float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
#define vreinterpret_f32_p8 vreinterpret_f32_u32

float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
#define vreinterpretq_f32_u32(t) *(__m128*)&(t)
float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
#define vreinterpretq_f32_u16 vreinterpretq_f32_u32
float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
#define vreinterpretq_f32_u8 vreinterpretq_f32_u32
float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
#define vreinterpretq_f32_s32 vreinterpretq_f32_u32
float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
#define vreinterpretq_f32_s16 vreinterpretq_f32_u32
float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
#define vreinterpretq_f32_s8 vreinterpretq_f32_u32
float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
#define vreinterpretq_f32_u64 vreinterpretq_f32_u32
float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
#define vreinterpretq_f32_s64 vreinterpretq_f32_u32
float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
#define vreinterpretq_f32_p16 vreinterpretq_f32_u32
float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
#define vreinterpretq_f32_p8 vreinterpretq_f32_u32
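//Illustrative sketch (hypothetical helper): vreinterpretq_f32_s32 above only
//re-labels the 128 bits, while vcvtq_f32_s32 performs a numeric conversion.
static void neon2sse_example_reinterpret_vs_convert(void)
{
    _NEON2SSE_ALIGN_16 int32_t in[4] = {1, 2, 3, 4};
    _NEON2SSE_ALIGN_16 float out_bits[4], out_vals[4];
    int32x4_t vi = vld1q_s32(in);
    float32x4_t as_bits = vreinterpretq_f32_s32(vi); // same bit pattern (tiny denormal floats)
    float32x4_t as_vals = vcvtq_f32_s32(vi);         // 1.0f, 2.0f, 3.0f, 4.0f
    vst1q_f32(out_bits, as_bits);
    vst1q_f32(out_vals, as_vals);
}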
//*** Integer type conversions ******************
//no conversion is necessary for the following functions because source and destination use the same data type
int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
#define vreinterpret_s64_u32
int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
#define vreinterpret_s64_u16
int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
#define vreinterpret_s64_u8
int64x1_t vreinterpret_s64_s32 (int32x2_t t);
#define vreinterpret_s64_s32
int64x1_t vreinterpret_s64_s16 (int16x4_t t);
#define vreinterpret_s64_s16
int64x1_t vreinterpret_s64_s8 (int8x8_t t);
#define vreinterpret_s64_s8
int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
#define vreinterpret_s64_u64
int64x1_t vreinterpret_s64_f32 (float32x2_t t);
#define vreinterpret_s64_f32
int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
#define vreinterpret_s64_p16
int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
#define vreinterpret_s64_p8
int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
#define vreinterpretq_s64_u32
int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
#define vreinterpretq_s64_s16
int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
#define vreinterpretq_s64_u8
int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
#define vreinterpretq_s64_s32
int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
#define vreinterpretq_s64_u16
int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
#define vreinterpretq_s64_s8
int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
#define vreinterpretq_s64_u64
int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
#define vreinterpretq_s64_f32(t) _M128i(t)
int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
#define vreinterpretq_s64_p16
int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
#define vreinterpretq_s64_p8
uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
#define vreinterpret_u64_u32
uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
#define vreinterpret_u64_u16
uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
#define vreinterpret_u64_u8
uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
#define vreinterpret_u64_s32
uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
#define vreinterpret_u64_s16
uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
#define vreinterpret_u64_s8
uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
#define vreinterpret_u64_s64
uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
#define vreinterpret_u64_f32
uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
#define vreinterpret_u64_p16
uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
#define vreinterpret_u64_p8

uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
#define vreinterpretq_u64_u32
uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
#define vreinterpretq_u64_u16
uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
#define vreinterpretq_u64_u8
uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
#define vreinterpretq_u64_s32
uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
#define vreinterpretq_u64_s16
uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
#define vreinterpretq_u64_s8
uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
#define vreinterpretq_u64_s64
uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
#define vreinterpretq_u64_f32(t) _M128i(t)
uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
#define vreinterpretq_u64_p16
uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
#define vreinterpretq_u64_p8
int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
#define vreinterpret_s8_u32
int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
#define vreinterpret_s8_u16
int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
#define vreinterpret_s8_u8
int8x8_t vreinterpret_s8_s32 (int32x2_t t);
#define vreinterpret_s8_s32
int8x8_t vreinterpret_s8_s16 (int16x4_t t);
#define vreinterpret_s8_s16
int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
#define vreinterpret_s8_u64
int8x8_t vreinterpret_s8_s64 (int64x1_t t);
#define vreinterpret_s8_s64
int8x8_t vreinterpret_s8_f32 (float32x2_t t);
#define vreinterpret_s8_f32
int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
#define vreinterpret_s8_p16
int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
#define vreinterpret_s8_p8

int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
#define vreinterpretq_s8_u32
int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
#define vreinterpretq_s8_u16
int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
#define vreinterpretq_s8_u8
int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
#define vreinterpretq_s8_s32
int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
#define vreinterpretq_s8_s16
int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
#define vreinterpretq_s8_u64
int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
#define vreinterpretq_s8_s64
int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
#define vreinterpretq_s8_f32(t) _M128i(t)
int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
#define vreinterpretq_s8_p16
int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
#define vreinterpretq_s8_p8
int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
#define vreinterpret_s16_u32
int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
#define vreinterpret_s16_u16
int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
#define vreinterpret_s16_u8
int16x4_t vreinterpret_s16_s32 (int32x2_t t);
#define vreinterpret_s16_s32
int16x4_t vreinterpret_s16_s8 (int8x8_t t);
#define vreinterpret_s16_s8
int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
#define vreinterpret_s16_u64
int16x4_t vreinterpret_s16_s64 (int64x1_t t);
#define vreinterpret_s16_s64
int16x4_t vreinterpret_s16_f32 (float32x2_t t);
#define vreinterpret_s16_f32
int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
#define vreinterpret_s16_p16
int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
#define vreinterpret_s16_p8

int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
#define vreinterpretq_s16_u32
int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
#define vreinterpretq_s16_u16
int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
#define vreinterpretq_s16_u8
int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
#define vreinterpretq_s16_s32
int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
#define vreinterpretq_s16_s8
int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
#define vreinterpretq_s16_u64
int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
#define vreinterpretq_s16_s64
int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
#define vreinterpretq_s16_f32(t) _M128i(t)
int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
#define vreinterpretq_s16_p16
int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
#define vreinterpretq_s16_p8
int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
#define vreinterpret_s32_u32
int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
#define vreinterpret_s32_u16
int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
#define vreinterpret_s32_u8
int32x2_t vreinterpret_s32_s16 (int16x4_t t);
#define vreinterpret_s32_s16
int32x2_t vreinterpret_s32_s8 (int8x8_t t);
#define vreinterpret_s32_s8
int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
#define vreinterpret_s32_u64
int32x2_t vreinterpret_s32_s64 (int64x1_t t);
#define vreinterpret_s32_s64
int32x2_t vreinterpret_s32_f32 (float32x2_t t);
#define vreinterpret_s32_f32
int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
#define vreinterpret_s32_p16
int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
#define vreinterpret_s32_p8

int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
#define vreinterpretq_s32_u32
int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
#define vreinterpretq_s32_u16
int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
#define vreinterpretq_s32_u8
int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
#define vreinterpretq_s32_s16
int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
#define vreinterpretq_s32_s8
int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
#define vreinterpretq_s32_u64
int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
#define vreinterpretq_s32_s64
int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
#define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t))
int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
#define vreinterpretq_s32_p16
int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
#define vreinterpretq_s32_p8
uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
#define vreinterpret_u8_u32
uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
#define vreinterpret_u8_u16
uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
#define vreinterpret_u8_s32
uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
#define vreinterpret_u8_s16
uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
#define vreinterpret_u8_s8
uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
#define vreinterpret_u8_u64
uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
#define vreinterpret_u8_s64
uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
#define vreinterpret_u8_f32
uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
#define vreinterpret_u8_p16
uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
#define vreinterpret_u8_p8

uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
#define vreinterpretq_u8_u32
uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
#define vreinterpretq_u8_u16
uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
#define vreinterpretq_u8_s32
uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
#define vreinterpretq_u8_s16
uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
#define vreinterpretq_u8_s8
uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
#define vreinterpretq_u8_u64
uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
#define vreinterpretq_u8_s64
uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
#define vreinterpretq_u8_f32(t) _M128i(t)
uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
#define vreinterpretq_u8_p16
uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
#define vreinterpretq_u8_p8
uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
#define vreinterpret_u16_u32
uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
#define vreinterpret_u16_u8
uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
#define vreinterpret_u16_s32
uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
#define vreinterpret_u16_s16
uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
#define vreinterpret_u16_s8
uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
#define vreinterpret_u16_u64
uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
#define vreinterpret_u16_s64
uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
#define vreinterpret_u16_f32
uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
#define vreinterpret_u16_p16
uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
#define vreinterpret_u16_p8

uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
#define vreinterpretq_u16_u32
uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
#define vreinterpretq_u16_u8
uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
#define vreinterpretq_u16_s32
uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
#define vreinterpretq_u16_s16
uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
#define vreinterpretq_u16_s8
uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
#define vreinterpretq_u16_u64
uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
#define vreinterpretq_u16_s64
uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
#define vreinterpretq_u16_f32(t) _M128i(t)
uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
#define vreinterpretq_u16_p16
uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
#define vreinterpretq_u16_p8
uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
#define vreinterpret_u32_u16
uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
#define vreinterpret_u32_u8
uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
#define vreinterpret_u32_s32
uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
#define vreinterpret_u32_s16
uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
#define vreinterpret_u32_s8
uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
#define vreinterpret_u32_u64
uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
#define vreinterpret_u32_s64
uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
#define vreinterpret_u32_f32
uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
#define vreinterpret_u32_p16
uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
#define vreinterpret_u32_p8

uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
#define vreinterpretq_u32_u16
uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
#define vreinterpretq_u32_u8
uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
#define vreinterpretq_u32_s32
uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
#define vreinterpretq_u32_s16
uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
#define vreinterpretq_u32_s8
uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
#define vreinterpretq_u32_u64
uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
#define vreinterpretq_u32_s64
uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
#define vreinterpretq_u32_f32(t) _M128i(t)
uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
#define vreinterpretq_u32_p16
uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
#define vreinterpretq_u32_p8

#endif /* NEON2SSE_H */