/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef _MOZILLA_GFX_SIMD_H_
#define _MOZILLA_GFX_SIMD_H_

/**
 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
 * if they want access to the SSE2 functions.
 */

#include <math.h>
#include <stdint.h>

#ifdef SIMD_COMPILE_SSE2
# include <emmintrin.h>
#endif

namespace mozilla {
namespace gfx {

namespace simd {
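
// A minimal usage sketch (illustrative only; not part of this header's API
// surface): a consumer opts into the SSE2 backend before inclusion and then
// calls the same generic entry points with either the scalar or the __m128i
// types.
//
//   #define SIMD_COMPILE_SSE2
//   #include "SIMD.h"
//
//   using namespace mozilla::gfx::simd;
//   __m128i a = FromI16<__m128i>(1, 2, 3, 4, 5, 6, 7, 8);
//   __m128i b = Add16(a, a);  // lane-wise: 2, 4, 6, ..., 16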

template <typename u8x16_t>
u8x16_t Load8(const uint8_t* aSource);

template <typename u8x16_t>
u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f,
              uint8_t g, uint8_t h, uint8_t i, uint8_t j, uint8_t k, uint8_t l,
              uint8_t m, uint8_t n, uint8_t o, uint8_t p);

template <typename u8x16_t>
u8x16_t FromZero8();

template <typename i16x8_t>
i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e,
                int16_t f, int16_t g, int16_t h);

template <typename u16x8_t>
u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e,
                uint16_t f, uint16_t g, uint16_t h);

template <typename i16x8_t>
i16x8_t FromI16(int16_t a);

template <typename u16x8_t>
u16x8_t FromU16(uint16_t a);

template <typename i32x4_t>
i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);

template <typename i32x4_t>
i32x4_t From32(int32_t a);

template <typename f32x4_t>
f32x4_t FromF32(float a, float b, float c, float d);

template <typename f32x4_t>
f32x4_t FromF32(float a);

// All SIMD backends overload these functions for their SIMD types:

// The declarations below are documentation only: u8x16_t, i16x8_t, u16x8_t,
// i32x4_t and f32x4_t are placeholder names, so the block is compiled out.
#if 0

// Store 16 bytes to a 16-byte aligned address
void Store8(uint8_t* aTarget, u8x16_t aM);

template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);

i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);

// Truncating i16 -> i16 multiplication
i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);

// Long multiplication i16 -> i32
// aFactorsA1B1 = (a1[4] b1[4])
// aFactorsA2B2 = (a2[4] b2[4])
// aProductA = a1 * a2, aProductB = b1 * b2
void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
                         i32x4_t& aProductA, i32x4_t& aProductB);

// Long multiplication + pairwise addition i16 -> i32
// See the scalar implementation for specifics.
i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);

// Set all four 32-bit components to the value of the component at aIndex.
template<int8_t aIndex>
i32x4_t Splat32(i32x4_t aM);

// Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
// re-interpret the result as sixteen 8-bit values.
template<int8_t aIndex>
u8x16_t Splat32On8(u8x16_t aM);

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
i32x4_t Shuffle32(i32x4_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
i16x8_t ShuffleLo16(i16x8_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
i16x8_t ShuffleHi16(i16x8_t aM);

u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);

i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);

i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3,
                             const i32x4_t& m4);

i32x4_t FastDivideBy255(i32x4_t m);
i16x8_t FastDivideBy255_16(i16x8_t m);

#endif

struct Scalaru8x16_t {
  uint8_t u8[16];
};

union Scalari16x8_t {
  int16_t i16[8];
  uint16_t u16[8];
};

typedef Scalari16x8_t Scalaru16x8_t;

struct Scalari32x4_t {
  int32_t i32[4];
};

struct Scalarf32x4_t {
  float f32[4];
};

template<>
inline Scalaru8x16_t Load8<Scalaru8x16_t>(const uint8_t* aSource) {
  return *(Scalaru8x16_t*)aSource;
}

inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM) {
  *(Scalaru8x16_t*)aTarget = aM;
}

template<>
inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c,
                                          uint8_t d, uint8_t e, uint8_t f,
                                          uint8_t g, uint8_t h, uint8_t i,
                                          uint8_t j, uint8_t k, uint8_t l,
                                          uint8_t m, uint8_t n, uint8_t o,
                                          uint8_t p) {
  Scalaru8x16_t result;
  result.u8[0] = a;
  result.u8[1] = b;
  result.u8[2] = c;
  result.u8[3] = d;
  result.u8[4] = e;
  result.u8[5] = f;
  result.u8[6] = g;
  result.u8[7] = h;
  result.u8[8] = i;
  result.u8[9] = j;
  result.u8[10] = k;
  result.u8[11] = l;
  result.u8[12] = m;
  result.u8[13] = n;
  result.u8[14] = o;
  result.u8[15] = p;
  return result;
}

template<>
inline Scalaru8x16_t FromZero8<Scalaru8x16_t>() {
  return From8<Scalaru8x16_t>(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

template<>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c,
                                            int16_t d, int16_t e, int16_t f,
                                            int16_t g, int16_t h) {
  Scalari16x8_t m;
  m.i16[0] = a;
  m.i16[1] = b;
  m.i16[2] = c;
  m.i16[3] = d;
  m.i16[4] = e;
  m.i16[5] = f;
  m.i16[6] = g;
  m.i16[7] = h;
  return m;
}

template<>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c,
                                            uint16_t d, uint16_t e, uint16_t f,
                                            uint16_t g, uint16_t h) {
  Scalaru16x8_t m;
  m.u16[0] = a;
  m.u16[1] = b;
  m.u16[2] = c;
  m.u16[3] = d;
  m.u16[4] = e;
  m.u16[5] = f;
  m.u16[6] = g;
  m.u16[7] = h;
  return m;
}

template<>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a) {
  return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
}

template<>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a) {
  return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
}

template<>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c,
                                           int32_t d) {
  Scalari32x4_t m;
  m.i32[0] = a;
  m.i32[1] = b;
  m.i32[2] = c;
  m.i32[3] = d;
  return m;
}

template<>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c,
                                            float d) {
  Scalarf32x4_t m;
  m.f32[0] = a;
  m.f32[1] = b;
  m.f32[2] = c;
  m.f32[3] = d;
  return m;
}

template<>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a) {
  return FromF32<Scalarf32x4_t>(a, a, a, a);
}

template<>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a) {
  return From32<Scalari32x4_t>(a, a, a, a);
}

template <int32_t aNumberOfBits>
inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM) {
  return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits,
                                uint16_t(aM.i16[1]) >> aNumberOfBits,
                                uint16_t(aM.i16[2]) >> aNumberOfBits,
                                uint16_t(aM.i16[3]) >> aNumberOfBits,
                                uint16_t(aM.i16[4]) >> aNumberOfBits,
                                uint16_t(aM.i16[5]) >> aNumberOfBits,
                                uint16_t(aM.i16[6]) >> aNumberOfBits,
                                uint16_t(aM.i16[7]) >> aNumberOfBits);
}

template <int32_t aNumberOfBits>
inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM) {
  return From32<Scalari32x4_t>(
      aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
      aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
}

inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1],
      aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3],
      aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
      aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
}

inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0],
                               aM1.i32[1] + aM2.i32[1],
                               aM1.i32[2] + aM2.i32[2],
                               aM1.i32[3] + aM2.i32[3]);
}

inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1],
      aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3],
      aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
      aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
}

inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0],
                               aM1.i32[1] - aM2.i32[1],
                               aM1.i32[2] - aM2.i32[2],
                               aM1.i32[3] - aM2.i32[3]);
}

// Branchless min/max: -(a > b) is 0 or ~0 (all ones), so the (a - b) term is
// either masked away entirely or subtracted in full.
inline int32_t umin(int32_t a, int32_t b) { return a - ((a - b) & -(a > b)); }

inline int32_t umax(int32_t a, int32_t b) { return a - ((a - b) & -(a < b)); }
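
// Worked example (illustrative): umin(7, 3) = 7 - ((7 - 3) & -1) = 3, while
// umin(3, 7) = 3 - ((3 - 7) & 0) = 3. The trick assumes a - b does not
// overflow int32_t.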

inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
  return From8<Scalaru8x16_t>(
      umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
      umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
      umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
      umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
      umin(aM1.u8[8 + 0], aM2.u8[8 + 0]), umin(aM1.u8[8 + 1], aM2.u8[8 + 1]),
      umin(aM1.u8[8 + 2], aM2.u8[8 + 2]), umin(aM1.u8[8 + 3], aM2.u8[8 + 3]),
      umin(aM1.u8[8 + 4], aM2.u8[8 + 4]), umin(aM1.u8[8 + 5], aM2.u8[8 + 5]),
      umin(aM1.u8[8 + 6], aM2.u8[8 + 6]), umin(aM1.u8[8 + 7], aM2.u8[8 + 7]));
}

inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
  return From8<Scalaru8x16_t>(
      umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
      umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
      umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
      umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
      umax(aM1.u8[8 + 0], aM2.u8[8 + 0]), umax(aM1.u8[8 + 1], aM2.u8[8 + 1]),
      umax(aM1.u8[8 + 2], aM2.u8[8 + 2]), umax(aM1.u8[8 + 3], aM2.u8[8 + 3]),
      umax(aM1.u8[8 + 4], aM2.u8[8 + 4]), umax(aM1.u8[8 + 5], aM2.u8[8 + 5]),
      umax(aM1.u8[8 + 6], aM2.u8[8 + 6]), umax(aM1.u8[8 + 7], aM2.u8[8 + 7]));
}

inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(
      umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
      umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
}

inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(
      umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
      umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
}

inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])),
      uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
      uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])),
      uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
      uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])),
      uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
      uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])),
      uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
}

inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
                                Scalari16x8_t aFactorsA2B2,
                                Scalari32x4_t& aProductA,
                                Scalari32x4_t& aProductB) {
  aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
                                    aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
                                    aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
                                    aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
  aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
                                    aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
                                    aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
                                    aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
}

inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
                                        Scalari16x8_t aFactorsB) {
  return From32<Scalari32x4_t>(
      aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
      aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
      aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
      aFactorsA.i16[6] * aFactorsB.i16[6] +
          aFactorsA.i16[7] * aFactorsB.i16[7]);
}
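
// Illustrative example (values assumed, not from the source): with
// aFactorsA = (1, 2, 3, 4, 5, 6, 7, 8) and aFactorsB = (1, 1, 1, 1, 1, 1, 1, 1)
// the result is (1 + 2, 3 + 4, 5 + 6, 7 + 8) = (3, 7, 11, 15) -- the same
// pairwise multiply-add that _mm_madd_epi16 performs in the SSE2 backend.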

template <int8_t aIndex>
inline void AssertIndex() {
  static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
                "Invalid splat index");
}

template <int8_t aIndex>
inline Scalari32x4_t Splat32(Scalari32x4_t aM) {
  AssertIndex<aIndex>();
  return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex], aM.i32[aIndex],
                               aM.i32[aIndex]);
}

template <int8_t aIndex>
inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM) {
  AssertIndex<aIndex>();
  return From8<Scalaru8x16_t>(
      aM.u8[aIndex * 4], aM.u8[aIndex * 4 + 1],
      aM.u8[aIndex * 4 + 2], aM.u8[aIndex * 4 + 3],
      aM.u8[aIndex * 4], aM.u8[aIndex * 4 + 1],
      aM.u8[aIndex * 4 + 2], aM.u8[aIndex * 4 + 3],
      aM.u8[aIndex * 4], aM.u8[aIndex * 4 + 1],
      aM.u8[aIndex * 4 + 2], aM.u8[aIndex * 4 + 3],
      aM.u8[aIndex * 4], aM.u8[aIndex * 4 + 1],
      aM.u8[aIndex * 4 + 2], aM.u8[aIndex * 4 + 3]);
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari32x4_t Shuffle32(Scalari32x4_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari32x4_t m = aM;
  m.i32[0] = aM.i32[i3];
  m.i32[1] = aM.i32[i2];
  m.i32[2] = aM.i32[i1];
  m.i32[3] = aM.i32[i0];
  return m;
}
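
// Note the reversed indexing above: i0 selects the value for the highest
// lane. This mirrors the _MM_SHUFFLE(i0, i1, i2, i3) convention used by the
// SSE2 backend; ShuffleLo16/ShuffleHi16 below follow the same pattern.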

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[0] = aM.i16[i3];
  m.i16[1] = aM.i16[i2];
  m.i16[2] = aM.i16[i1];
  m.i16[3] = aM.i16[i0];
  return m;
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[4 + 0] = aM.i16[4 + i3];
  m.i16[4 + 1] = aM.i16[4 + i2];
  m.i16[4 + 2] = aM.i16[4 + i1];
  m.i16[4 + 3] = aM.i16[4 + i0];
  return m;
}

template <int8_t aIndexLo, int8_t aIndexHi>
inline Scalaru16x8_t Splat16(Scalaru16x8_t aM) {
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  Scalaru16x8_t m;
  uint16_t chosenValueLo = aM.u16[aIndexLo];
  m.u16[0] = chosenValueLo;
  m.u16[1] = chosenValueLo;
  m.u16[2] = chosenValueLo;
  m.u16[3] = chosenValueLo;
  uint16_t chosenValueHi = aM.u16[4 + aIndexHi];
  m.u16[4] = chosenValueHi;
  m.u16[5] = chosenValueHi;
  m.u16[6] = chosenValueHi;
  m.u16[7] = chosenValueHi;
  return m;
}

inline Scalaru8x16_t InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2) {
  return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1], m1.u8[2],
                              m2.u8[2], m1.u8[3], m2.u8[3], m1.u8[4], m2.u8[4],
                              m1.u8[5], m2.u8[5], m1.u8[6], m2.u8[6], m1.u8[7],
                              m2.u8[7]);
}

inline Scalaru8x16_t InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2) {
  return From8<Scalaru8x16_t>(
      m1.u8[8 + 0], m2.u8[8 + 0], m1.u8[8 + 1], m2.u8[8 + 1], m1.u8[8 + 2],
      m2.u8[8 + 2], m1.u8[8 + 3], m2.u8[8 + 3], m1.u8[8 + 4], m2.u8[8 + 4],
      m1.u8[8 + 5], m2.u8[8 + 5], m1.u8[8 + 6], m2.u8[8 + 6], m1.u8[8 + 7],
      m2.u8[8 + 7]);
}

inline Scalaru16x8_t InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2) {
  return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
                                m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
}

inline Scalaru16x8_t InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2) {
  return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
                                m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
}

inline Scalari32x4_t InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2) {
  return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
}

inline Scalari16x8_t UnpackLo8x8ToI16x8(Scalaru8x16_t aM) {
  Scalari16x8_t m;
  m.i16[0] = aM.u8[0];
  m.i16[1] = aM.u8[1];
  m.i16[2] = aM.u8[2];
  m.i16[3] = aM.u8[3];
  m.i16[4] = aM.u8[4];
  m.i16[5] = aM.u8[5];
  m.i16[6] = aM.u8[6];
  m.i16[7] = aM.u8[7];
  return m;
}

inline Scalari16x8_t UnpackHi8x8ToI16x8(Scalaru8x16_t aM) {
  Scalari16x8_t m;
  m.i16[0] = aM.u8[8 + 0];
  m.i16[1] = aM.u8[8 + 1];
  m.i16[2] = aM.u8[8 + 2];
  m.i16[3] = aM.u8[8 + 3];
  m.i16[4] = aM.u8[8 + 4];
  m.i16[5] = aM.u8[8 + 5];
  m.i16[6] = aM.u8[8 + 6];
  m.i16[7] = aM.u8[8 + 7];
  return m;
}

inline Scalaru16x8_t UnpackLo8x8ToU16x8(Scalaru8x16_t aM) {
  return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]),
                                uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
                                uint16_t(aM.u8[4]), uint16_t(aM.u8[5]),
                                uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
}

inline Scalaru16x8_t UnpackHi8x8ToU16x8(Scalaru8x16_t aM) {
  return FromU16<Scalaru16x8_t>(aM.u8[8 + 0], aM.u8[8 + 1], aM.u8[8 + 2],
                                aM.u8[8 + 3], aM.u8[8 + 4], aM.u8[8 + 5],
                                aM.u8[8 + 6], aM.u8[8 + 7]);
}

template <uint8_t aNumBytes>
inline Scalaru8x16_t Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678) {
  Scalaru8x16_t m;
  for (uint8_t i = 0; i < 16; i++) {
    uint8_t sourceByte = i + aNumBytes;
    m.u8[i] =
        sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
  }
  return m;
}
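
// Rotate8 reads 16 consecutive bytes starting aNumBytes into the 32-byte
// concatenation (a1234, a5678); it is the scalar analogue of the
// (_mm_srli_si128, _mm_slli_si128, or) combination in the SSE2 backend.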

template <typename T>
inline int16_t SaturateTo16(T a) {
  return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX)
                                : INT16_MIN);
}

inline Scalari16x8_t PackAndSaturate32To16(Scalari32x4_t m1,
                                           Scalari32x4_t m2) {
  Scalari16x8_t m;
  m.i16[0] = SaturateTo16(m1.i32[0]);
  m.i16[1] = SaturateTo16(m1.i32[1]);
  m.i16[2] = SaturateTo16(m1.i32[2]);
  m.i16[3] = SaturateTo16(m1.i32[3]);
  m.i16[4] = SaturateTo16(m2.i32[0]);
  m.i16[5] = SaturateTo16(m2.i32[1]);
  m.i16[6] = SaturateTo16(m2.i32[2]);
  m.i16[7] = SaturateTo16(m2.i32[3]);
  return m;
}

template <typename T>
inline uint16_t SaturateToU16(T a) {
  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
}
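
// a & -(a >= 0) keeps a when it is non-negative and yields 0 otherwise. The
// upper clamp is INT16_MAX rather than UINT16_MAX, mirroring the SSE2
// backend's PackAndSaturate32ToU16, which is built on the signed-saturating
// _mm_packs_epi32 (the two backends agree for non-negative inputs).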

inline Scalaru16x8_t PackAndSaturate32ToU16(Scalari32x4_t m1,
                                            Scalari32x4_t m2) {
  Scalaru16x8_t m;
  m.u16[0] = SaturateToU16(m1.i32[0]);
  m.u16[1] = SaturateToU16(m1.i32[1]);
  m.u16[2] = SaturateToU16(m1.i32[2]);
  m.u16[3] = SaturateToU16(m1.i32[3]);
  m.u16[4] = SaturateToU16(m2.i32[0]);
  m.u16[5] = SaturateToU16(m2.i32[1]);
  m.u16[6] = SaturateToU16(m2.i32[2]);
  m.u16[7] = SaturateToU16(m2.i32[3]);
  return m;
}

template <typename T>
inline uint8_t SaturateTo8(T a) {
  return uint8_t(umin(a & -(a >= 0), 255));
}

inline Scalaru8x16_t PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2,
                                          Scalari32x4_t m3,
                                          const Scalari32x4_t& m4) {
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i32[0]);
  m.u8[1] = SaturateTo8(m1.i32[1]);
  m.u8[2] = SaturateTo8(m1.i32[2]);
  m.u8[3] = SaturateTo8(m1.i32[3]);
  m.u8[4] = SaturateTo8(m2.i32[0]);
  m.u8[5] = SaturateTo8(m2.i32[1]);
  m.u8[6] = SaturateTo8(m2.i32[2]);
  m.u8[7] = SaturateTo8(m2.i32[3]);
  m.u8[8] = SaturateTo8(m3.i32[0]);
  m.u8[9] = SaturateTo8(m3.i32[1]);
  m.u8[10] = SaturateTo8(m3.i32[2]);
  m.u8[11] = SaturateTo8(m3.i32[3]);
  m.u8[12] = SaturateTo8(m4.i32[0]);
  m.u8[13] = SaturateTo8(m4.i32[1]);
  m.u8[14] = SaturateTo8(m4.i32[2]);
  m.u8[15] = SaturateTo8(m4.i32[3]);
  return m;
}

inline Scalaru8x16_t PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2) {
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i16[0]);
  m.u8[1] = SaturateTo8(m1.i16[1]);
  m.u8[2] = SaturateTo8(m1.i16[2]);
  m.u8[3] = SaturateTo8(m1.i16[3]);
  m.u8[4] = SaturateTo8(m1.i16[4]);
  m.u8[5] = SaturateTo8(m1.i16[5]);
  m.u8[6] = SaturateTo8(m1.i16[6]);
  m.u8[7] = SaturateTo8(m1.i16[7]);
  m.u8[8] = SaturateTo8(m2.i16[0]);
  m.u8[9] = SaturateTo8(m2.i16[1]);
  m.u8[10] = SaturateTo8(m2.i16[2]);
  m.u8[11] = SaturateTo8(m2.i16[3]);
  m.u8[12] = SaturateTo8(m2.i16[4]);
  m.u8[13] = SaturateTo8(m2.i16[5]);
  m.u8[14] = SaturateTo8(m2.i16[6]);
  m.u8[15] = SaturateTo8(m2.i16[7]);
  return m;
}

// Fast approximate division by 255. It has the property that
// for all 0 <= n <= 255*255, FAST_DIVIDE_BY_255(n) == n/255.
// But it only uses two adds and two shifts instead of an
// integer division (which is expensive on many processors).
//
// equivalent to v/255
template <class B, class A>
inline B FastDivideBy255(A v) {
  return ((v << 8) + v + 255) >> 16;
}
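
// Worked example: for v = 255*255 = 65025, the largest value covered by the
// guarantee above, (65025 << 8) + 65025 + 255 = 16711680 and
// 16711680 >> 16 = 255 = 65025/255. The expression computes
// (v*257 + 255) >> 16, a fixed-point approximation of division by 255.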

inline Scalaru16x8_t FastDivideBy255_16(Scalaru16x8_t m) {
  return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
}

inline Scalari32x4_t FastDivideBy255(Scalari32x4_t m) {
  return From32<Scalari32x4_t>(
      FastDivideBy255<int32_t>(m.i32[0]), FastDivideBy255<int32_t>(m.i32[1]),
      FastDivideBy255<int32_t>(m.i32[2]), FastDivideBy255<int32_t>(m.i32[3]));
}

inline Scalaru8x16_t Pick(Scalaru8x16_t mask, Scalaru8x16_t a,
                          Scalaru8x16_t b) {
  return From8<Scalaru8x16_t>(
      (a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
      (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
      (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
      (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
      (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
      (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
      (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
      (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
      (a.u8[8 + 0] & (~mask.u8[8 + 0])) | (b.u8[8 + 0] & mask.u8[8 + 0]),
      (a.u8[8 + 1] & (~mask.u8[8 + 1])) | (b.u8[8 + 1] & mask.u8[8 + 1]),
      (a.u8[8 + 2] & (~mask.u8[8 + 2])) | (b.u8[8 + 2] & mask.u8[8 + 2]),
      (a.u8[8 + 3] & (~mask.u8[8 + 3])) | (b.u8[8 + 3] & mask.u8[8 + 3]),
      (a.u8[8 + 4] & (~mask.u8[8 + 4])) | (b.u8[8 + 4] & mask.u8[8 + 4]),
      (a.u8[8 + 5] & (~mask.u8[8 + 5])) | (b.u8[8 + 5] & mask.u8[8 + 5]),
      (a.u8[8 + 6] & (~mask.u8[8 + 6])) | (b.u8[8 + 6] & mask.u8[8 + 6]),
      (a.u8[8 + 7] & (~mask.u8[8 + 7])) | (b.u8[8 + 7] & mask.u8[8 + 7]));
}

inline Scalari32x4_t Pick(Scalari32x4_t mask, Scalari32x4_t a,
                          Scalari32x4_t b) {
  return From32<Scalari32x4_t>(
      (a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
      (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
      (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
      (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
}
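
// Pick is a bitwise select: each result bit comes from b where mask is 1 and
// from a where mask is 0, so lane masks are expected to be all-ones or
// all-zeros, as produced by comparison operations.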

inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t) {
  return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
                                a.f32[1] + (b.f32[1] - a.f32[1]) * t,
                                a.f32[2] + (b.f32[2] - a.f32[2]) * t,
                                a.f32[3] + (b.f32[3] - a.f32[3]) * t);
}

inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa,
                             float wb) {
  return FromF32<Scalarf32x4_t>(
      a.f32[0] * wa + b.f32[0] * wb, a.f32[1] * wa + b.f32[1] * wb,
      a.f32[2] * wa + b.f32[2] * wb, a.f32[3] * wa + b.f32[3] * wb);
}

inline Scalarf32x4_t AbsF32(Scalarf32x4_t a) {
  return FromF32<Scalarf32x4_t>(fabs(a.f32[0]), fabs(a.f32[1]), fabs(a.f32[2]),
                                fabs(a.f32[3]));
}

inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0], a.f32[1] + b.f32[1],
                                a.f32[2] + b.f32[2], a.f32[3] + b.f32[3]);
}

inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0], a.f32[1] * b.f32[1],
                                a.f32[2] * b.f32[2], a.f32[3] * b.f32[3]);
}

inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0], a.f32[1] / b.f32[1],
                                a.f32[2] / b.f32[2], a.f32[3] / b.f32[3]);
}

template <uint8_t aIndex>
inline Scalarf32x4_t SplatF32(Scalarf32x4_t m) {
  AssertIndex<aIndex>();
  return FromF32<Scalarf32x4_t>(m.f32[aIndex], m.f32[aIndex], m.f32[aIndex],
                                m.f32[aIndex]);
}

inline Scalari32x4_t F32ToI32(Scalarf32x4_t m) {
  return From32<Scalari32x4_t>(
      int32_t(floor(m.f32[0] + 0.5f)), int32_t(floor(m.f32[1] + 0.5f)),
      int32_t(floor(m.f32[2] + 0.5f)), int32_t(floor(m.f32[3] + 0.5f)));
}
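
// Note: the scalar backend rounds half-way cases up via floor(x + 0.5f),
// whereas _mm_cvtps_epi32 in the SSE2 backend uses the CPU's current rounding
// mode (round-to-nearest-even by default), so the two can differ by one on
// exact .5 ties.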

#ifdef SIMD_COMPILE_SSE2

template<>
inline __m128i Load8<__m128i>(const uint8_t* aSource) {
  return _mm_load_si128((const __m128i*)aSource);
}

inline void Store8(uint8_t* aTarget, __m128i aM) {
  _mm_store_si128((__m128i*)aTarget, aM);
}

template<>
inline __m128i FromZero8<__m128i>() {
  return _mm_setzero_si128();
}

template<>
inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                              uint8_t e, uint8_t f, uint8_t g, uint8_t h,
                              uint8_t i, uint8_t j, uint8_t k, uint8_t l,
                              uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  // Each 16-bit lane packs two consecutive bytes little-endian: the first
  // byte of the pair goes in the low half, the second in the high half.
  return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g,
                        (j << 8) + i, (l << 8) + k, (n << 8) + m,
                        (p << 8) + o);
}

template<>
inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d,
                                int16_t e, int16_t f, int16_t g, int16_t h) {
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template<>
inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
                                uint16_t e, uint16_t f, uint16_t g,
                                uint16_t h) {
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template<>
inline __m128i FromI16<__m128i>(int16_t a) {
  return _mm_set1_epi16(a);
}

template<>
inline __m128i FromU16<__m128i>(uint16_t a) {
  return _mm_set1_epi16((int16_t)a);
}

template<>
inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d) {
  return _mm_setr_epi32(a, b, c, d);
}

template<>
inline __m128i From32<__m128i>(int32_t a) {
  return _mm_set1_epi32(a);
}

template<>
inline __m128 FromF32<__m128>(float a, float b, float c, float d) {
  return _mm_setr_ps(a, b, c, d);
}

template<>
inline __m128 FromF32<__m128>(float a) {
  return _mm_set1_ps(a);
}

template <int32_t aNumberOfBits>
inline __m128i ShiftRight16(__m128i aM) {
  return _mm_srli_epi16(aM, aNumberOfBits);
}

template <int32_t aNumberOfBits>
inline __m128i ShiftRight32(__m128i aM) {
  return _mm_srai_epi32(aM, aNumberOfBits);
}

inline __m128i Add16(__m128i aM1, __m128i aM2) {
  return _mm_add_epi16(aM1, aM2);
}

inline __m128i Add32(__m128i aM1, __m128i aM2) {
  return _mm_add_epi32(aM1, aM2);
}

inline __m128i Sub16(__m128i aM1, __m128i aM2) {
  return _mm_sub_epi16(aM1, aM2);
}

inline __m128i Sub32(__m128i aM1, __m128i aM2) {
  return _mm_sub_epi32(aM1, aM2);
}

inline __m128i Min8(__m128i aM1, __m128i aM2) { return _mm_min_epu8(aM1, aM2); }

inline __m128i Max8(__m128i aM1, __m128i aM2) { return _mm_max_epu8(aM1, aM2); }

inline __m128i Min32(__m128i aM1, __m128i aM2) {
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
}

inline __m128i Max32(__m128i aM1, __m128i aM2) {
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
}
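
// SSE2 has no packed 32-bit integer min/max (those arrive with SSE4.1), so
// Min32/Max32 above synthesize them from a compare mask:
// aM1 - ((aM1 - aM2) & mask) yields aM2 where the mask is all-ones and aM1
// where it is zero -- the same branchless trick as the scalar umin/umax.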

inline __m128i Mul16(__m128i aM1, __m128i aM2) {
  return _mm_mullo_epi16(aM1, aM2);
}

inline __m128i MulU16(__m128i aM1, __m128i aM2) {
  return _mm_mullo_epi16(aM1, aM2);
}

inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1, __m128i aFactorsA2B2,
                                __m128i& aProductA, __m128i& aProductB) {
  __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
  __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
  aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
  aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
}
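
// _mm_mullo_epi16/_mm_mulhi_epi16 produce the low and high 16 bits of each
// 16x16 -> 32 bit product; unpacklo/unpackhi then interleave the two halves
// of corresponding lanes, reassembling four full 32-bit products per output
// register.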

inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA, __m128i aFactorsB) {
  return _mm_madd_epi16(aFactorsA, aFactorsB);
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i Shuffle32(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleLo16(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleHi16(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t aIndex>
inline __m128i Splat32(__m128i aM) {
  return Shuffle32<aIndex, aIndex, aIndex, aIndex>(aM);
}

template <int8_t aIndex>
inline __m128i Splat32On8(__m128i aM) {
  return Shuffle32<aIndex, aIndex, aIndex, aIndex>(aM);
}

template <int8_t aIndexLo, int8_t aIndexHi>
inline __m128i Splat16(__m128i aM) {
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  return ShuffleHi16<aIndexHi, aIndexHi, aIndexHi, aIndexHi>(
      ShuffleLo16<aIndexLo, aIndexLo, aIndexLo, aIndexLo>(aM));
}

inline __m128i UnpackLo8x8ToI16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i UnpackHi8x8ToI16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i UnpackLo8x8ToU16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i UnpackHi8x8ToU16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i InterleaveLo8(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi8(m1, m2);
}

inline __m128i InterleaveHi8(__m128i m1, __m128i m2) {
  return _mm_unpackhi_epi8(m1, m2);
}

inline __m128i InterleaveLo16(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi16(m1, m2);
}

inline __m128i InterleaveHi16(__m128i m1, __m128i m2) {
  return _mm_unpackhi_epi16(m1, m2);
}

inline __m128i InterleaveLo32(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi32(m1, m2);
}

template <uint8_t aNumBytes>
inline __m128i Rotate8(__m128i a1234, __m128i a5678) {
  return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes),
                      _mm_slli_si128(a5678, 16 - aNumBytes));
}

inline __m128i PackAndSaturate32To16(__m128i m1, __m128i m2) {
  return _mm_packs_epi32(m1, m2);
}

inline __m128i PackAndSaturate32ToU16(__m128i m1, __m128i m2) {
  return _mm_packs_epi32(m1, m2);
}

inline __m128i PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3,
                                    const __m128i& m4) {
  // Pack into 8 16bit signed integers (saturating).
  __m128i m12 = _mm_packs_epi32(m1, m2);
  __m128i m34 = _mm_packs_epi32(m3, m4);

  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m12, m34);
}

inline __m128i PackAndSaturate16To8(__m128i m1, __m128i m2) {
  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m1, m2);
}

inline __m128i FastDivideBy255(__m128i m) {
  // v = m << 8
  __m128i v = _mm_slli_epi32(m, 8);
  // v = v + (m + (255,255,255,255))
  v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
  // v = v >> 16
  return _mm_srai_epi32(v, 16);
}

inline __m128i FastDivideBy255_16(__m128i m) {
  __m128i zero = _mm_set1_epi16(0);
  __m128i lo = _mm_unpacklo_epi16(m, zero);
  __m128i hi = _mm_unpackhi_epi16(m, zero);
  return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
}
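
// The eight 16-bit lanes are first widened to 32 bits because the
// (v << 8) + v + 255 intermediate overflows 16 bits; _mm_packs_epi32 then
// narrows the two four-lane results back into eight 16-bit lanes.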

inline __m128i Pick(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

inline __m128 MixF32(__m128 a, __m128 b, float t) {
  return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
}

inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb) {
  return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)),
                    _mm_mul_ps(b, _mm_set1_ps(wb)));
}

inline __m128 AbsF32(__m128 a) {
  return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
}

inline __m128 AddF32(__m128 a, __m128 b) { return _mm_add_ps(a, b); }

inline __m128 MulF32(__m128 a, __m128 b) { return _mm_mul_ps(a, b); }

inline __m128 DivF32(__m128 a, __m128 b) { return _mm_div_ps(a, b); }

template <uint8_t aIndex>
inline __m128 SplatF32(__m128 m) {
  AssertIndex<aIndex>();
  return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
}

inline __m128i F32ToI32(__m128 m) { return _mm_cvtps_epi32(m); }

#endif // SIMD_COMPILE_SSE2

} // namespace simd

} // namespace gfx
} // namespace mozilla

#endif // _MOZILLA_GFX_SIMD_H_