1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6 #ifndef _MOZILLA_GFX_SIMD_H_
7 #define _MOZILLA_GFX_SIMD_H_
/* Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
 * if they want access to the SSE2 functions. */
#ifdef SIMD_COMPILE_SSE2
#include <xmmintrin.h>
// The SSE2 integer intrinsics used below (__m128i, _mm_*_epi*) are declared
// in emmintrin.h; xmmintrin.h alone only provides SSE1 (float) intrinsics.
#include <emmintrin.h>
23 template<typename u8x16_t
>
24 u8x16_t
Load8(const uint8_t* aSource
);
26 template<typename u8x16_t
>
27 u8x16_t
From8(uint8_t a
, uint8_t b
, uint8_t c
, uint8_t d
, uint8_t e
, uint8_t f
, uint8_t g
, uint8_t h
,
28 uint8_t i
, uint8_t j
, uint8_t k
, uint8_t l
, uint8_t m
, uint8_t n
, uint8_t o
, uint8_t p
);
30 template<typename u8x16_t
>
33 template<typename i16x8_t
>
34 i16x8_t
FromI16(int16_t a
, int16_t b
, int16_t c
, int16_t d
, int16_t e
, int16_t f
, int16_t g
, int16_t h
);
36 template<typename u16x8_t
>
37 u16x8_t
FromU16(uint16_t a
, uint16_t b
, uint16_t c
, uint16_t d
, uint16_t e
, uint16_t f
, uint16_t g
, uint16_t h
);
39 template<typename i16x8_t
>
40 i16x8_t
FromI16(int16_t a
);
42 template<typename u16x8_t
>
43 u16x8_t
FromU16(uint16_t a
);
45 template<typename i32x4_t
>
46 i32x4_t
From32(int32_t a
, int32_t b
, int32_t c
, int32_t d
);
48 template<typename i32x4_t
>
49 i32x4_t
From32(int32_t a
);
51 template<typename f32x4_t
>
52 f32x4_t
FromF32(float a
, float b
, float c
, float d
);
54 template<typename f32x4_t
>
55 f32x4_t
FromF32(float a
);
57 // All SIMD backends overload these functions for their SIMD types:
61 // Store 16 bytes to a 16-byte aligned address
62 void Store8(uint8_t* aTarget
, u8x16_t aM
);
65 template<int32_t aNumberOfBits
> i16x8_t
ShiftRight16(i16x8_t aM
);
66 template<int32_t aNumberOfBits
> i32x4_t
ShiftRight32(i32x4_t aM
);
68 i16x8_t
Add16(i16x8_t aM1
, i16x8_t aM2
);
69 i32x4_t
Add32(i32x4_t aM1
, i32x4_t aM2
);
70 i16x8_t
Sub16(i16x8_t aM1
, i16x8_t aM2
);
71 i32x4_t
Sub32(i32x4_t aM1
, i32x4_t aM2
);
// u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
// u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
74 i32x4_t
Min32(i32x4_t aM1
, i32x4_t aM2
);
75 i32x4_t
Max32(i32x4_t aM1
, i32x4_t aM2
);
77 // Truncating i16 -> i16 multiplication
78 i16x8_t
Mul16(i16x8_t aM1
, i16x8_t aM2
);
80 // Long multiplication i16 -> i32
81 // aFactorsA1B1 = (a1[4] b1[4])
82 // aFactorsA2B2 = (a2[4] b2[4])
83 // aProductA = a1 * a2, aProductB = b1 * b2
84 void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1
, i16x8_t aFactorsA2B2
,
85 i32x4_t
& aProductA
, i32x4_t
& aProductB
);
87 // Long multiplication + pairwise addition i16 -> i32
88 // See the scalar implementation for specifics.
89 i32x4_t
MulAdd16x8x2To32x4(i16x8_t aFactorsA
, i16x8_t aFactorsB
);
90 i32x4_t
MulAdd16x8x2To32x4(u16x8_t aFactorsA
, u16x8_t aFactorsB
);
92 // Set all four 32-bit components to the value of the component at aIndex.
93 template<int8_t aIndex
>
94 i32x4_t
Splat32(i32x4_t aM
);
96 // Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
97 // re-interpret the result as sixteen 8-bit values.
98 template<int8_t aIndex
>
99 u8x16_t
Splat32On8(u8x16_t aM
);
// template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
// template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
// template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);
105 u8x16_t
InterleaveLo8(u8x16_t m1
, u8x16_t m2
);
106 u8x16_t
InterleaveHi8(u8x16_t m1
, u8x16_t m2
);
107 i16x8_t
InterleaveLo16(i16x8_t m1
, i16x8_t m2
);
108 i16x8_t
InterleaveHi16(i16x8_t m1
, i16x8_t m2
);
109 i32x4_t
InterleaveLo32(i32x4_t m1
, i32x4_t m2
);
111 i16x8_t
UnpackLo8x8ToI16x8(u8x16_t m
);
112 i16x8_t
UnpackHi8x8ToI16x8(u8x16_t m
);
113 u16x8_t
UnpackLo8x8ToU16x8(u8x16_t m
);
114 u16x8_t
UnpackHi8x8ToU16x8(u8x16_t m
);
116 i16x8_t
PackAndSaturate32To16(i32x4_t m1
, i32x4_t m2
);
117 u8x16_t
PackAndSaturate16To8(i16x8_t m1
, i16x8_t m2
);
118 u8x16_t
PackAndSaturate32To8(i32x4_t m1
, i32x4_t m2
, i32x4_t m3
, const i32x4_t
& m4
);
// i32x4_t FastDivideBy255(i32x4_t m);
// i16x8_t FastDivideBy255_16(i16x8_t m);
127 struct Scalaru8x16_t
{
131 union Scalari16x8_t
{
136 typedef Scalari16x8_t Scalaru16x8_t
;
138 struct Scalari32x4_t
{
142 struct Scalarf32x4_t
{
148 Load8
<Scalaru8x16_t
>(const uint8_t* aSource
)
150 return *(Scalaru8x16_t
*)aSource
;
153 inline void Store8(uint8_t* aTarget
, Scalaru8x16_t aM
)
155 *(Scalaru8x16_t
*)aTarget
= aM
;
159 inline Scalaru8x16_t From8
<Scalaru8x16_t
>(uint8_t a
, uint8_t b
, uint8_t c
, uint8_t d
, uint8_t e
, uint8_t f
, uint8_t g
, uint8_t h
,
160 uint8_t i
, uint8_t j
, uint8_t k
, uint8_t l
, uint8_t m
, uint8_t n
, uint8_t o
, uint8_t p
)
183 inline Scalaru8x16_t FromZero8
<Scalaru8x16_t
>()
185 return From8
<Scalaru8x16_t
>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
189 inline Scalari16x8_t FromI16
<Scalari16x8_t
>(int16_t a
, int16_t b
, int16_t c
, int16_t d
, int16_t e
, int16_t f
, int16_t g
, int16_t h
)
204 inline Scalaru16x8_t FromU16
<Scalaru16x8_t
>(uint16_t a
, uint16_t b
, uint16_t c
, uint16_t d
, uint16_t e
, uint16_t f
, uint16_t g
, uint16_t h
)
219 inline Scalari16x8_t FromI16
<Scalari16x8_t
>(int16_t a
)
221 return FromI16
<Scalari16x8_t
>(a
, a
, a
, a
, a
, a
, a
, a
);
225 inline Scalaru16x8_t FromU16
<Scalaru16x8_t
>(uint16_t a
)
227 return FromU16
<Scalaru16x8_t
>(a
, a
, a
, a
, a
, a
, a
, a
);
231 inline Scalari32x4_t From32
<Scalari32x4_t
>(int32_t a
, int32_t b
, int32_t c
, int32_t d
)
242 inline Scalarf32x4_t FromF32
<Scalarf32x4_t
>(float a
, float b
, float c
, float d
)
253 inline Scalarf32x4_t FromF32
<Scalarf32x4_t
>(float a
)
255 return FromF32
<Scalarf32x4_t
>(a
, a
, a
, a
);
259 inline Scalari32x4_t From32
<Scalari32x4_t
>(int32_t a
)
261 return From32
<Scalari32x4_t
>(a
, a
, a
, a
);
264 template<int32_t aNumberOfBits
>
265 inline Scalari16x8_t
ShiftRight16(Scalari16x8_t aM
)
267 return FromI16
<Scalari16x8_t
>(uint16_t(aM
.i16
[0]) >> aNumberOfBits
, uint16_t(aM
.i16
[1]) >> aNumberOfBits
,
268 uint16_t(aM
.i16
[2]) >> aNumberOfBits
, uint16_t(aM
.i16
[3]) >> aNumberOfBits
,
269 uint16_t(aM
.i16
[4]) >> aNumberOfBits
, uint16_t(aM
.i16
[5]) >> aNumberOfBits
,
270 uint16_t(aM
.i16
[6]) >> aNumberOfBits
, uint16_t(aM
.i16
[7]) >> aNumberOfBits
);
273 template<int32_t aNumberOfBits
>
274 inline Scalari32x4_t
ShiftRight32(Scalari32x4_t aM
)
276 return From32
<Scalari32x4_t
>(aM
.i32
[0] >> aNumberOfBits
, aM
.i32
[1] >> aNumberOfBits
,
277 aM
.i32
[2] >> aNumberOfBits
, aM
.i32
[3] >> aNumberOfBits
);
280 inline Scalaru16x8_t
Add16(Scalaru16x8_t aM1
, Scalaru16x8_t aM2
)
282 return FromU16
<Scalaru16x8_t
>(aM1
.u16
[0] + aM2
.u16
[0], aM1
.u16
[1] + aM2
.u16
[1],
283 aM1
.u16
[2] + aM2
.u16
[2], aM1
.u16
[3] + aM2
.u16
[3],
284 aM1
.u16
[4] + aM2
.u16
[4], aM1
.u16
[5] + aM2
.u16
[5],
285 aM1
.u16
[6] + aM2
.u16
[6], aM1
.u16
[7] + aM2
.u16
[7]);
288 inline Scalari32x4_t
Add32(Scalari32x4_t aM1
, Scalari32x4_t aM2
)
290 return From32
<Scalari32x4_t
>(aM1
.i32
[0] + aM2
.i32
[0], aM1
.i32
[1] + aM2
.i32
[1],
291 aM1
.i32
[2] + aM2
.i32
[2], aM1
.i32
[3] + aM2
.i32
[3]);
294 inline Scalaru16x8_t
Sub16(Scalaru16x8_t aM1
, Scalaru16x8_t aM2
)
296 return FromU16
<Scalaru16x8_t
>(aM1
.u16
[0] - aM2
.u16
[0], aM1
.u16
[1] - aM2
.u16
[1],
297 aM1
.u16
[2] - aM2
.u16
[2], aM1
.u16
[3] - aM2
.u16
[3],
298 aM1
.u16
[4] - aM2
.u16
[4], aM1
.u16
[5] - aM2
.u16
[5],
299 aM1
.u16
[6] - aM2
.u16
[6], aM1
.u16
[7] - aM2
.u16
[7]);
302 inline Scalari32x4_t
Sub32(Scalari32x4_t aM1
, Scalari32x4_t aM2
)
304 return From32
<Scalari32x4_t
>(aM1
.i32
[0] - aM2
.i32
[0], aM1
.i32
[1] - aM2
.i32
[1],
305 aM1
.i32
[2] - aM2
.i32
[2], aM1
.i32
[3] - aM2
.i32
[3]);
309 umin(int32_t a
, int32_t b
)
311 return a
- ((a
- b
) & -(a
> b
));
315 umax(int32_t a
, int32_t b
)
317 return a
- ((a
- b
) & -(a
< b
));
320 inline Scalaru8x16_t
Min8(Scalaru8x16_t aM1
, Scalaru8x16_t aM2
)
322 return From8
<Scalaru8x16_t
>(umin(aM1
.u8
[0], aM2
.u8
[0]), umin(aM1
.u8
[1], aM2
.u8
[1]),
323 umin(aM1
.u8
[2], aM2
.u8
[2]), umin(aM1
.u8
[3], aM2
.u8
[3]),
324 umin(aM1
.u8
[4], aM2
.u8
[4]), umin(aM1
.u8
[5], aM2
.u8
[5]),
325 umin(aM1
.u8
[6], aM2
.u8
[6]), umin(aM1
.u8
[7], aM2
.u8
[7]),
326 umin(aM1
.u8
[8+0], aM2
.u8
[8+0]), umin(aM1
.u8
[8+1], aM2
.u8
[8+1]),
327 umin(aM1
.u8
[8+2], aM2
.u8
[8+2]), umin(aM1
.u8
[8+3], aM2
.u8
[8+3]),
328 umin(aM1
.u8
[8+4], aM2
.u8
[8+4]), umin(aM1
.u8
[8+5], aM2
.u8
[8+5]),
329 umin(aM1
.u8
[8+6], aM2
.u8
[8+6]), umin(aM1
.u8
[8+7], aM2
.u8
[8+7]));
332 inline Scalaru8x16_t
Max8(Scalaru8x16_t aM1
, Scalaru8x16_t aM2
)
334 return From8
<Scalaru8x16_t
>(umax(aM1
.u8
[0], aM2
.u8
[0]), umax(aM1
.u8
[1], aM2
.u8
[1]),
335 umax(aM1
.u8
[2], aM2
.u8
[2]), umax(aM1
.u8
[3], aM2
.u8
[3]),
336 umax(aM1
.u8
[4], aM2
.u8
[4]), umax(aM1
.u8
[5], aM2
.u8
[5]),
337 umax(aM1
.u8
[6], aM2
.u8
[6]), umax(aM1
.u8
[7], aM2
.u8
[7]),
338 umax(aM1
.u8
[8+0], aM2
.u8
[8+0]), umax(aM1
.u8
[8+1], aM2
.u8
[8+1]),
339 umax(aM1
.u8
[8+2], aM2
.u8
[8+2]), umax(aM1
.u8
[8+3], aM2
.u8
[8+3]),
340 umax(aM1
.u8
[8+4], aM2
.u8
[8+4]), umax(aM1
.u8
[8+5], aM2
.u8
[8+5]),
341 umax(aM1
.u8
[8+6], aM2
.u8
[8+6]), umax(aM1
.u8
[8+7], aM2
.u8
[8+7]));
344 inline Scalari32x4_t
Min32(Scalari32x4_t aM1
, Scalari32x4_t aM2
)
346 return From32
<Scalari32x4_t
>(umin(aM1
.i32
[0], aM2
.i32
[0]), umin(aM1
.i32
[1], aM2
.i32
[1]),
347 umin(aM1
.i32
[2], aM2
.i32
[2]), umin(aM1
.i32
[3], aM2
.i32
[3]));
350 inline Scalari32x4_t
Max32(Scalari32x4_t aM1
, Scalari32x4_t aM2
)
352 return From32
<Scalari32x4_t
>(umax(aM1
.i32
[0], aM2
.i32
[0]), umax(aM1
.i32
[1], aM2
.i32
[1]),
353 umax(aM1
.i32
[2], aM2
.i32
[2]), umax(aM1
.i32
[3], aM2
.i32
[3]));
356 inline Scalaru16x8_t
Mul16(Scalaru16x8_t aM1
, Scalaru16x8_t aM2
)
358 return FromU16
<Scalaru16x8_t
>(uint16_t(int32_t(aM1
.u16
[0]) * int32_t(aM2
.u16
[0])), uint16_t(int32_t(aM1
.u16
[1]) * int32_t(aM2
.u16
[1])),
359 uint16_t(int32_t(aM1
.u16
[2]) * int32_t(aM2
.u16
[2])), uint16_t(int32_t(aM1
.u16
[3]) * int32_t(aM2
.u16
[3])),
360 uint16_t(int32_t(aM1
.u16
[4]) * int32_t(aM2
.u16
[4])), uint16_t(int32_t(aM1
.u16
[5]) * int32_t(aM2
.u16
[5])),
361 uint16_t(int32_t(aM1
.u16
[6]) * int32_t(aM2
.u16
[6])), uint16_t(int32_t(aM1
.u16
[7]) * int32_t(aM2
.u16
[7])));
364 inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1
,
365 Scalari16x8_t aFactorsA2B2
,
366 Scalari32x4_t
& aProductA
,
367 Scalari32x4_t
& aProductB
)
369 aProductA
= From32
<Scalari32x4_t
>(aFactorsA1B1
.i16
[0] * aFactorsA2B2
.i16
[0],
370 aFactorsA1B1
.i16
[1] * aFactorsA2B2
.i16
[1],
371 aFactorsA1B1
.i16
[2] * aFactorsA2B2
.i16
[2],
372 aFactorsA1B1
.i16
[3] * aFactorsA2B2
.i16
[3]);
373 aProductB
= From32
<Scalari32x4_t
>(aFactorsA1B1
.i16
[4] * aFactorsA2B2
.i16
[4],
374 aFactorsA1B1
.i16
[5] * aFactorsA2B2
.i16
[5],
375 aFactorsA1B1
.i16
[6] * aFactorsA2B2
.i16
[6],
376 aFactorsA1B1
.i16
[7] * aFactorsA2B2
.i16
[7]);
379 inline Scalari32x4_t
MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA
,
380 Scalari16x8_t aFactorsB
)
382 return From32
<Scalari32x4_t
>(aFactorsA
.i16
[0] * aFactorsB
.i16
[0] + aFactorsA
.i16
[1] * aFactorsB
.i16
[1],
383 aFactorsA
.i16
[2] * aFactorsB
.i16
[2] + aFactorsA
.i16
[3] * aFactorsB
.i16
[3],
384 aFactorsA
.i16
[4] * aFactorsB
.i16
[4] + aFactorsA
.i16
[5] * aFactorsB
.i16
[5],
385 aFactorsA
.i16
[6] * aFactorsB
.i16
[6] + aFactorsA
.i16
[7] * aFactorsB
.i16
[7]);
388 template<int8_t aIndex
>
389 inline void AssertIndex()
391 static_assert(aIndex
== 0 || aIndex
== 1 || aIndex
== 2 || aIndex
== 3,
392 "Invalid splat index");
395 template<int8_t aIndex
>
396 inline Scalari32x4_t
Splat32(Scalari32x4_t aM
)
398 AssertIndex
<aIndex
>();
399 return From32
<Scalari32x4_t
>(aM
.i32
[aIndex
], aM
.i32
[aIndex
],
400 aM
.i32
[aIndex
], aM
.i32
[aIndex
]);
404 inline Scalaru8x16_t
Splat32On8(Scalaru8x16_t aM
)
407 return From8
<Scalaru8x16_t
>(aM
.u8
[i
*4], aM
.u8
[i
*4+1], aM
.u8
[i
*4+2], aM
.u8
[i
*4+3],
408 aM
.u8
[i
*4], aM
.u8
[i
*4+1], aM
.u8
[i
*4+2], aM
.u8
[i
*4+3],
409 aM
.u8
[i
*4], aM
.u8
[i
*4+1], aM
.u8
[i
*4+2], aM
.u8
[i
*4+3],
410 aM
.u8
[i
*4], aM
.u8
[i
*4+1], aM
.u8
[i
*4+2], aM
.u8
[i
*4+3]);
413 template<int8_t i0
, int8_t i1
, int8_t i2
, int8_t i3
>
414 inline Scalari32x4_t
Shuffle32(Scalari32x4_t aM
)
420 Scalari32x4_t m
= aM
;
421 m
.i32
[0] = aM
.i32
[i3
];
422 m
.i32
[1] = aM
.i32
[i2
];
423 m
.i32
[2] = aM
.i32
[i1
];
424 m
.i32
[3] = aM
.i32
[i0
];
428 template<int8_t i0
, int8_t i1
, int8_t i2
, int8_t i3
>
429 inline Scalari16x8_t
ShuffleLo16(Scalari16x8_t aM
)
435 Scalari16x8_t m
= aM
;
436 m
.i16
[0] = aM
.i16
[i3
];
437 m
.i16
[1] = aM
.i16
[i2
];
438 m
.i16
[2] = aM
.i16
[i1
];
439 m
.i16
[3] = aM
.i16
[i0
];
443 template<int8_t i0
, int8_t i1
, int8_t i2
, int8_t i3
>
444 inline Scalari16x8_t
ShuffleHi16(Scalari16x8_t aM
)
450 Scalari16x8_t m
= aM
;
451 m
.i16
[4 + 0] = aM
.i16
[4 + i3
];
452 m
.i16
[4 + 1] = aM
.i16
[4 + i2
];
453 m
.i16
[4 + 2] = aM
.i16
[4 + i1
];
454 m
.i16
[4 + 3] = aM
.i16
[4 + i0
];
458 template<int8_t aIndexLo
, int8_t aIndexHi
>
459 inline Scalaru16x8_t
Splat16(Scalaru16x8_t aM
)
461 AssertIndex
<aIndexLo
>();
462 AssertIndex
<aIndexHi
>();
464 int16_t chosenValueLo
= aM
.u16
[aIndexLo
];
465 m
.u16
[0] = chosenValueLo
;
466 m
.u16
[1] = chosenValueLo
;
467 m
.u16
[2] = chosenValueLo
;
468 m
.u16
[3] = chosenValueLo
;
469 int16_t chosenValueHi
= aM
.u16
[4 + aIndexHi
];
470 m
.u16
[4] = chosenValueHi
;
471 m
.u16
[5] = chosenValueHi
;
472 m
.u16
[6] = chosenValueHi
;
473 m
.u16
[7] = chosenValueHi
;
478 InterleaveLo8(Scalaru8x16_t m1
, Scalaru8x16_t m2
)
480 return From8
<Scalaru8x16_t
>(m1
.u8
[0], m2
.u8
[0], m1
.u8
[1], m2
.u8
[1],
481 m1
.u8
[2], m2
.u8
[2], m1
.u8
[3], m2
.u8
[3],
482 m1
.u8
[4], m2
.u8
[4], m1
.u8
[5], m2
.u8
[5],
483 m1
.u8
[6], m2
.u8
[6], m1
.u8
[7], m2
.u8
[7]);
487 InterleaveHi8(Scalaru8x16_t m1
, Scalaru8x16_t m2
)
489 return From8
<Scalaru8x16_t
>(m1
.u8
[8+0], m2
.u8
[8+0], m1
.u8
[8+1], m2
.u8
[8+1],
490 m1
.u8
[8+2], m2
.u8
[8+2], m1
.u8
[8+3], m2
.u8
[8+3],
491 m1
.u8
[8+4], m2
.u8
[8+4], m1
.u8
[8+5], m2
.u8
[8+5],
492 m1
.u8
[8+6], m2
.u8
[8+6], m1
.u8
[8+7], m2
.u8
[8+7]);
496 InterleaveLo16(Scalaru16x8_t m1
, Scalaru16x8_t m2
)
498 return FromU16
<Scalaru16x8_t
>(m1
.u16
[0], m2
.u16
[0], m1
.u16
[1], m2
.u16
[1],
499 m1
.u16
[2], m2
.u16
[2], m1
.u16
[3], m2
.u16
[3]);
503 InterleaveHi16(Scalaru16x8_t m1
, Scalaru16x8_t m2
)
505 return FromU16
<Scalaru16x8_t
>(m1
.u16
[4], m2
.u16
[4], m1
.u16
[5], m2
.u16
[5],
506 m1
.u16
[6], m2
.u16
[6], m1
.u16
[7], m2
.u16
[7]);
510 InterleaveLo32(Scalari32x4_t m1
, Scalari32x4_t m2
)
512 return From32
<Scalari32x4_t
>(m1
.i32
[0], m2
.i32
[0], m1
.i32
[1], m2
.i32
[1]);
516 UnpackLo8x8ToI16x8(Scalaru8x16_t aM
)
531 UnpackHi8x8ToI16x8(Scalaru8x16_t aM
)
534 m
.i16
[0] = aM
.u8
[8+0];
535 m
.i16
[1] = aM
.u8
[8+1];
536 m
.i16
[2] = aM
.u8
[8+2];
537 m
.i16
[3] = aM
.u8
[8+3];
538 m
.i16
[4] = aM
.u8
[8+4];
539 m
.i16
[5] = aM
.u8
[8+5];
540 m
.i16
[6] = aM
.u8
[8+6];
541 m
.i16
[7] = aM
.u8
[8+7];
546 UnpackLo8x8ToU16x8(Scalaru8x16_t aM
)
548 return FromU16
<Scalaru16x8_t
>(uint16_t(aM
.u8
[0]), uint16_t(aM
.u8
[1]), uint16_t(aM
.u8
[2]), uint16_t(aM
.u8
[3]),
549 uint16_t(aM
.u8
[4]), uint16_t(aM
.u8
[5]), uint16_t(aM
.u8
[6]), uint16_t(aM
.u8
[7]));
553 UnpackHi8x8ToU16x8(Scalaru8x16_t aM
)
555 return FromU16
<Scalaru16x8_t
>(aM
.u8
[8+0], aM
.u8
[8+1], aM
.u8
[8+2], aM
.u8
[8+3],
556 aM
.u8
[8+4], aM
.u8
[8+5], aM
.u8
[8+6], aM
.u8
[8+7]);
559 template<uint8_t aNumBytes
>
561 Rotate8(Scalaru8x16_t a1234
, Scalaru8x16_t a5678
)
564 for (uint8_t i
= 0; i
< 16; i
++) {
565 uint8_t sourceByte
= i
+ aNumBytes
;
566 m
.u8
[i
] = sourceByte
< 16 ? a1234
.u8
[sourceByte
] : a5678
.u8
[sourceByte
- 16];
575 return int16_t(a
>= INT16_MIN
? (a
<= INT16_MAX
? a
: INT16_MAX
) : INT16_MIN
);
579 PackAndSaturate32To16(Scalari32x4_t m1
, Scalari32x4_t m2
)
582 m
.i16
[0] = SaturateTo16(m1
.i32
[0]);
583 m
.i16
[1] = SaturateTo16(m1
.i32
[1]);
584 m
.i16
[2] = SaturateTo16(m1
.i32
[2]);
585 m
.i16
[3] = SaturateTo16(m1
.i32
[3]);
586 m
.i16
[4] = SaturateTo16(m2
.i32
[0]);
587 m
.i16
[5] = SaturateTo16(m2
.i32
[1]);
588 m
.i16
[6] = SaturateTo16(m2
.i32
[2]);
589 m
.i16
[7] = SaturateTo16(m2
.i32
[3]);
597 return uint16_t(umin(a
& -(a
>= 0), INT16_MAX
));
601 PackAndSaturate32ToU16(Scalari32x4_t m1
, Scalari32x4_t m2
)
604 m
.u16
[0] = SaturateToU16(m1
.i32
[0]);
605 m
.u16
[1] = SaturateToU16(m1
.i32
[1]);
606 m
.u16
[2] = SaturateToU16(m1
.i32
[2]);
607 m
.u16
[3] = SaturateToU16(m1
.i32
[3]);
608 m
.u16
[4] = SaturateToU16(m2
.i32
[0]);
609 m
.u16
[5] = SaturateToU16(m2
.i32
[1]);
610 m
.u16
[6] = SaturateToU16(m2
.i32
[2]);
611 m
.u16
[7] = SaturateToU16(m2
.i32
[3]);
619 return uint8_t(umin(a
& -(a
>= 0), 255));
623 PackAndSaturate32To8(Scalari32x4_t m1
, Scalari32x4_t m2
, Scalari32x4_t m3
, const Scalari32x4_t
& m4
)
626 m
.u8
[0] = SaturateTo8(m1
.i32
[0]);
627 m
.u8
[1] = SaturateTo8(m1
.i32
[1]);
628 m
.u8
[2] = SaturateTo8(m1
.i32
[2]);
629 m
.u8
[3] = SaturateTo8(m1
.i32
[3]);
630 m
.u8
[4] = SaturateTo8(m2
.i32
[0]);
631 m
.u8
[5] = SaturateTo8(m2
.i32
[1]);
632 m
.u8
[6] = SaturateTo8(m2
.i32
[2]);
633 m
.u8
[7] = SaturateTo8(m2
.i32
[3]);
634 m
.u8
[8] = SaturateTo8(m3
.i32
[0]);
635 m
.u8
[9] = SaturateTo8(m3
.i32
[1]);
636 m
.u8
[10] = SaturateTo8(m3
.i32
[2]);
637 m
.u8
[11] = SaturateTo8(m3
.i32
[3]);
638 m
.u8
[12] = SaturateTo8(m4
.i32
[0]);
639 m
.u8
[13] = SaturateTo8(m4
.i32
[1]);
640 m
.u8
[14] = SaturateTo8(m4
.i32
[2]);
641 m
.u8
[15] = SaturateTo8(m4
.i32
[3]);
646 PackAndSaturate16To8(Scalari16x8_t m1
, Scalari16x8_t m2
)
649 m
.u8
[0] = SaturateTo8(m1
.i16
[0]);
650 m
.u8
[1] = SaturateTo8(m1
.i16
[1]);
651 m
.u8
[2] = SaturateTo8(m1
.i16
[2]);
652 m
.u8
[3] = SaturateTo8(m1
.i16
[3]);
653 m
.u8
[4] = SaturateTo8(m1
.i16
[4]);
654 m
.u8
[5] = SaturateTo8(m1
.i16
[5]);
655 m
.u8
[6] = SaturateTo8(m1
.i16
[6]);
656 m
.u8
[7] = SaturateTo8(m1
.i16
[7]);
657 m
.u8
[8] = SaturateTo8(m2
.i16
[0]);
658 m
.u8
[9] = SaturateTo8(m2
.i16
[1]);
659 m
.u8
[10] = SaturateTo8(m2
.i16
[2]);
660 m
.u8
[11] = SaturateTo8(m2
.i16
[3]);
661 m
.u8
[12] = SaturateTo8(m2
.i16
[4]);
662 m
.u8
[13] = SaturateTo8(m2
.i16
[5]);
663 m
.u8
[14] = SaturateTo8(m2
.i16
[6]);
664 m
.u8
[15] = SaturateTo8(m2
.i16
[7]);
668 // Fast approximate division by 255. It has the property that
669 // for all 0 <= n <= 255*255, FAST_DIVIDE_BY_255(n) == n/255.
670 // But it only uses two adds and two shifts instead of an
671 // integer division (which is expensive on many processors).
673 // equivalent to v/255
674 template<class B
, class A
>
675 inline B
FastDivideBy255(A v
)
677 return ((v
<< 8) + v
+ 255) >> 16;
681 FastDivideBy255_16(Scalaru16x8_t m
)
683 return FromU16
<Scalaru16x8_t
>(FastDivideBy255
<uint16_t>(int32_t(m
.u16
[0])),
684 FastDivideBy255
<uint16_t>(int32_t(m
.u16
[1])),
685 FastDivideBy255
<uint16_t>(int32_t(m
.u16
[2])),
686 FastDivideBy255
<uint16_t>(int32_t(m
.u16
[3])),
687 FastDivideBy255
<uint16_t>(int32_t(m
.u16
[4])),
688 FastDivideBy255
<uint16_t>(int32_t(m
.u16
[5])),
689 FastDivideBy255
<uint16_t>(int32_t(m
.u16
[6])),
690 FastDivideBy255
<uint16_t>(int32_t(m
.u16
[7])));
694 FastDivideBy255(Scalari32x4_t m
)
696 return From32
<Scalari32x4_t
>(FastDivideBy255
<int32_t>(m
.i32
[0]),
697 FastDivideBy255
<int32_t>(m
.i32
[1]),
698 FastDivideBy255
<int32_t>(m
.i32
[2]),
699 FastDivideBy255
<int32_t>(m
.i32
[3]));
703 Pick(Scalaru8x16_t mask
, Scalaru8x16_t a
, Scalaru8x16_t b
)
705 return From8
<Scalaru8x16_t
>((a
.u8
[0] & (~mask
.u8
[0])) | (b
.u8
[0] & mask
.u8
[0]),
706 (a
.u8
[1] & (~mask
.u8
[1])) | (b
.u8
[1] & mask
.u8
[1]),
707 (a
.u8
[2] & (~mask
.u8
[2])) | (b
.u8
[2] & mask
.u8
[2]),
708 (a
.u8
[3] & (~mask
.u8
[3])) | (b
.u8
[3] & mask
.u8
[3]),
709 (a
.u8
[4] & (~mask
.u8
[4])) | (b
.u8
[4] & mask
.u8
[4]),
710 (a
.u8
[5] & (~mask
.u8
[5])) | (b
.u8
[5] & mask
.u8
[5]),
711 (a
.u8
[6] & (~mask
.u8
[6])) | (b
.u8
[6] & mask
.u8
[6]),
712 (a
.u8
[7] & (~mask
.u8
[7])) | (b
.u8
[7] & mask
.u8
[7]),
713 (a
.u8
[8+0] & (~mask
.u8
[8+0])) | (b
.u8
[8+0] & mask
.u8
[8+0]),
714 (a
.u8
[8+1] & (~mask
.u8
[8+1])) | (b
.u8
[8+1] & mask
.u8
[8+1]),
715 (a
.u8
[8+2] & (~mask
.u8
[8+2])) | (b
.u8
[8+2] & mask
.u8
[8+2]),
716 (a
.u8
[8+3] & (~mask
.u8
[8+3])) | (b
.u8
[8+3] & mask
.u8
[8+3]),
717 (a
.u8
[8+4] & (~mask
.u8
[8+4])) | (b
.u8
[8+4] & mask
.u8
[8+4]),
718 (a
.u8
[8+5] & (~mask
.u8
[8+5])) | (b
.u8
[8+5] & mask
.u8
[8+5]),
719 (a
.u8
[8+6] & (~mask
.u8
[8+6])) | (b
.u8
[8+6] & mask
.u8
[8+6]),
720 (a
.u8
[8+7] & (~mask
.u8
[8+7])) | (b
.u8
[8+7] & mask
.u8
[8+7]));
724 Pick(Scalari32x4_t mask
, Scalari32x4_t a
, Scalari32x4_t b
)
726 return From32
<Scalari32x4_t
>((a
.i32
[0] & (~mask
.i32
[0])) | (b
.i32
[0] & mask
.i32
[0]),
727 (a
.i32
[1] & (~mask
.i32
[1])) | (b
.i32
[1] & mask
.i32
[1]),
728 (a
.i32
[2] & (~mask
.i32
[2])) | (b
.i32
[2] & mask
.i32
[2]),
729 (a
.i32
[3] & (~mask
.i32
[3])) | (b
.i32
[3] & mask
.i32
[3]));
732 inline Scalarf32x4_t
MixF32(Scalarf32x4_t a
, Scalarf32x4_t b
, float t
)
734 return FromF32
<Scalarf32x4_t
>(a
.f32
[0] + (b
.f32
[0] - a
.f32
[0]) * t
,
735 a
.f32
[1] + (b
.f32
[1] - a
.f32
[1]) * t
,
736 a
.f32
[2] + (b
.f32
[2] - a
.f32
[2]) * t
,
737 a
.f32
[3] + (b
.f32
[3] - a
.f32
[3]) * t
);
740 inline Scalarf32x4_t
WSumF32(Scalarf32x4_t a
, Scalarf32x4_t b
, float wa
, float wb
)
742 return FromF32
<Scalarf32x4_t
>(a
.f32
[0] * wa
+ b
.f32
[0] * wb
,
743 a
.f32
[1] * wa
+ b
.f32
[1] * wb
,
744 a
.f32
[2] * wa
+ b
.f32
[2] * wb
,
745 a
.f32
[3] * wa
+ b
.f32
[3] * wb
);
748 inline Scalarf32x4_t
AbsF32(Scalarf32x4_t a
)
750 return FromF32
<Scalarf32x4_t
>(fabs(a
.f32
[0]),
756 inline Scalarf32x4_t
AddF32(Scalarf32x4_t a
, Scalarf32x4_t b
)
758 return FromF32
<Scalarf32x4_t
>(a
.f32
[0] + b
.f32
[0],
761 a
.f32
[3] + b
.f32
[3]);
764 inline Scalarf32x4_t
MulF32(Scalarf32x4_t a
, Scalarf32x4_t b
)
766 return FromF32
<Scalarf32x4_t
>(a
.f32
[0] * b
.f32
[0],
769 a
.f32
[3] * b
.f32
[3]);
772 inline Scalarf32x4_t
DivF32(Scalarf32x4_t a
, Scalarf32x4_t b
)
774 return FromF32
<Scalarf32x4_t
>(a
.f32
[0] / b
.f32
[0],
777 a
.f32
[3] / b
.f32
[3]);
780 template<uint8_t aIndex
>
781 inline Scalarf32x4_t
SplatF32(Scalarf32x4_t m
)
783 AssertIndex
<aIndex
>();
784 return FromF32
<Scalarf32x4_t
>(m
.f32
[aIndex
],
790 inline Scalari32x4_t
F32ToI32(Scalarf32x4_t m
)
792 return From32
<Scalari32x4_t
>(int32_t(floor(m
.f32
[0] + 0.5f
)),
793 int32_t(floor(m
.f32
[1] + 0.5f
)),
794 int32_t(floor(m
.f32
[2] + 0.5f
)),
795 int32_t(floor(m
.f32
[3] + 0.5f
)));
798 #ifdef SIMD_COMPILE_SSE2
804 Load8
<__m128i
>(const uint8_t* aSource
)
806 return _mm_load_si128((const __m128i
*)aSource
);
809 inline void Store8(uint8_t* aTarget
, __m128i aM
)
811 _mm_store_si128((__m128i
*)aTarget
, aM
);
815 inline __m128i FromZero8
<__m128i
>()
817 return _mm_setzero_si128();
821 inline __m128i From8
<__m128i
>(uint8_t a
, uint8_t b
, uint8_t c
, uint8_t d
, uint8_t e
, uint8_t f
, uint8_t g
, uint8_t h
,
822 uint8_t i
, uint8_t j
, uint8_t k
, uint8_t l
, uint8_t m
, uint8_t n
, uint8_t o
, uint8_t p
)
824 return _mm_setr_epi16((b
<< 8) + a
, (d
<< 8) + c
, (e
<< 8) + f
, (h
<< 8) + g
,
825 (j
<< 8) + i
, (l
<< 8) + k
, (m
<< 8) + n
, (p
<< 8) + o
);
829 inline __m128i FromI16
<__m128i
>(int16_t a
, int16_t b
, int16_t c
, int16_t d
, int16_t e
, int16_t f
, int16_t g
, int16_t h
)
831 return _mm_setr_epi16(a
, b
, c
, d
, e
, f
, g
, h
);
835 inline __m128i FromU16
<__m128i
>(uint16_t a
, uint16_t b
, uint16_t c
, uint16_t d
, uint16_t e
, uint16_t f
, uint16_t g
, uint16_t h
)
837 return _mm_setr_epi16(a
, b
, c
, d
, e
, f
, g
, h
);
841 inline __m128i FromI16
<__m128i
>(int16_t a
)
843 return _mm_set1_epi16(a
);
847 inline __m128i FromU16
<__m128i
>(uint16_t a
)
849 return _mm_set1_epi16((int16_t)a
);
853 inline __m128i From32
<__m128i
>(int32_t a
, int32_t b
, int32_t c
, int32_t d
)
855 return _mm_setr_epi32(a
, b
, c
, d
);
859 inline __m128i From32
<__m128i
>(int32_t a
)
861 return _mm_set1_epi32(a
);
865 inline __m128 FromF32
<__m128
>(float a
, float b
, float c
, float d
)
867 return _mm_setr_ps(a
, b
, c
, d
);
871 inline __m128 FromF32
<__m128
>(float a
)
873 return _mm_set1_ps(a
);
876 template<int32_t aNumberOfBits
>
877 inline __m128i
ShiftRight16(__m128i aM
)
879 return _mm_srli_epi16(aM
, aNumberOfBits
);
882 template<int32_t aNumberOfBits
>
883 inline __m128i
ShiftRight32(__m128i aM
)
885 return _mm_srai_epi32(aM
, aNumberOfBits
);
888 inline __m128i
Add16(__m128i aM1
, __m128i aM2
)
890 return _mm_add_epi16(aM1
, aM2
);
893 inline __m128i
Add32(__m128i aM1
, __m128i aM2
)
895 return _mm_add_epi32(aM1
, aM2
);
898 inline __m128i
Sub16(__m128i aM1
, __m128i aM2
)
900 return _mm_sub_epi16(aM1
, aM2
);
903 inline __m128i
Sub32(__m128i aM1
, __m128i aM2
)
905 return _mm_sub_epi32(aM1
, aM2
);
908 inline __m128i
Min8(__m128i aM1
, __m128i aM2
)
910 return _mm_min_epu8(aM1
, aM2
);
913 inline __m128i
Max8(__m128i aM1
, __m128i aM2
)
915 return _mm_max_epu8(aM1
, aM2
);
918 inline __m128i
Min32(__m128i aM1
, __m128i aM2
)
920 __m128i m1_minus_m2
= _mm_sub_epi32(aM1
, aM2
);
921 __m128i m1_greater_than_m2
= _mm_cmpgt_epi32(aM1
, aM2
);
922 return _mm_sub_epi32(aM1
, _mm_and_si128(m1_minus_m2
, m1_greater_than_m2
));
925 inline __m128i
Max32(__m128i aM1
, __m128i aM2
)
927 __m128i m1_minus_m2
= _mm_sub_epi32(aM1
, aM2
);
928 __m128i m2_greater_than_m1
= _mm_cmpgt_epi32(aM2
, aM1
);
929 return _mm_sub_epi32(aM1
, _mm_and_si128(m1_minus_m2
, m2_greater_than_m1
));
932 inline __m128i
Mul16(__m128i aM1
, __m128i aM2
)
934 return _mm_mullo_epi16(aM1
, aM2
);
937 inline __m128i
MulU16(__m128i aM1
, __m128i aM2
)
939 return _mm_mullo_epi16(aM1
, aM2
);
942 inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1
,
943 __m128i aFactorsA2B2
,
947 __m128i prodAB_lo
= _mm_mullo_epi16(aFactorsA1B1
, aFactorsA2B2
);
948 __m128i prodAB_hi
= _mm_mulhi_epi16(aFactorsA1B1
, aFactorsA2B2
);
949 aProductA
= _mm_unpacklo_epi16(prodAB_lo
, prodAB_hi
);
950 aProductB
= _mm_unpackhi_epi16(prodAB_lo
, prodAB_hi
);
953 inline __m128i
MulAdd16x8x2To32x4(__m128i aFactorsA
,
956 return _mm_madd_epi16(aFactorsA
, aFactorsB
);
959 template<int8_t i0
, int8_t i1
, int8_t i2
, int8_t i3
>
960 inline __m128i
Shuffle32(__m128i aM
)
966 return _mm_shuffle_epi32(aM
, _MM_SHUFFLE(i0
, i1
, i2
, i3
));
969 template<int8_t i0
, int8_t i1
, int8_t i2
, int8_t i3
>
970 inline __m128i
ShuffleLo16(__m128i aM
)
976 return _mm_shufflelo_epi16(aM
, _MM_SHUFFLE(i0
, i1
, i2
, i3
));
979 template<int8_t i0
, int8_t i1
, int8_t i2
, int8_t i3
>
980 inline __m128i
ShuffleHi16(__m128i aM
)
986 return _mm_shufflehi_epi16(aM
, _MM_SHUFFLE(i0
, i1
, i2
, i3
));
989 template<int8_t aIndex
>
990 inline __m128i
Splat32(__m128i aM
)
992 return Shuffle32
<aIndex
,aIndex
,aIndex
,aIndex
>(aM
);
995 template<int8_t aIndex
>
996 inline __m128i
Splat32On8(__m128i aM
)
998 return Shuffle32
<aIndex
,aIndex
,aIndex
,aIndex
>(aM
);
1001 template<int8_t aIndexLo
, int8_t aIndexHi
>
1002 inline __m128i
Splat16(__m128i aM
)
1004 AssertIndex
<aIndexLo
>();
1005 AssertIndex
<aIndexHi
>();
1006 return ShuffleHi16
<aIndexHi
,aIndexHi
,aIndexHi
,aIndexHi
>(
1007 ShuffleLo16
<aIndexLo
,aIndexLo
,aIndexLo
,aIndexLo
>(aM
));
1011 UnpackLo8x8ToI16x8(__m128i m
)
1013 __m128i zero
= _mm_set1_epi8(0);
1014 return _mm_unpacklo_epi8(m
, zero
);
1018 UnpackHi8x8ToI16x8(__m128i m
)
1020 __m128i zero
= _mm_set1_epi8(0);
1021 return _mm_unpackhi_epi8(m
, zero
);
1025 UnpackLo8x8ToU16x8(__m128i m
)
1027 __m128i zero
= _mm_set1_epi8(0);
1028 return _mm_unpacklo_epi8(m
, zero
);
1032 UnpackHi8x8ToU16x8(__m128i m
)
1034 __m128i zero
= _mm_set1_epi8(0);
1035 return _mm_unpackhi_epi8(m
, zero
);
1039 InterleaveLo8(__m128i m1
, __m128i m2
)
1041 return _mm_unpacklo_epi8(m1
, m2
);
1045 InterleaveHi8(__m128i m1
, __m128i m2
)
1047 return _mm_unpackhi_epi8(m1
, m2
);
1051 InterleaveLo16(__m128i m1
, __m128i m2
)
1053 return _mm_unpacklo_epi16(m1
, m2
);
1057 InterleaveHi16(__m128i m1
, __m128i m2
)
1059 return _mm_unpackhi_epi16(m1
, m2
);
1063 InterleaveLo32(__m128i m1
, __m128i m2
)
1065 return _mm_unpacklo_epi32(m1
, m2
);
1068 template<uint8_t aNumBytes
>
1070 Rotate8(__m128i a1234
, __m128i a5678
)
1072 return _mm_or_si128(_mm_srli_si128(a1234
, aNumBytes
), _mm_slli_si128(a5678
, 16 - aNumBytes
));
1076 PackAndSaturate32To16(__m128i m1
, __m128i m2
)
1078 return _mm_packs_epi32(m1
, m2
);
1082 PackAndSaturate32ToU16(__m128i m1
, __m128i m2
)
1084 return _mm_packs_epi32(m1
, m2
);
1088 PackAndSaturate32To8(__m128i m1
, __m128i m2
, __m128i m3
, const __m128i
& m4
)
1090 // Pack into 8 16bit signed integers (saturating).
1091 __m128i m12
= _mm_packs_epi32(m1
, m2
);
1092 __m128i m34
= _mm_packs_epi32(m3
, m4
);
1094 // Pack into 16 8bit unsigned integers (saturating).
1095 return _mm_packus_epi16(m12
, m34
);
1099 PackAndSaturate16To8(__m128i m1
, __m128i m2
)
1101 // Pack into 16 8bit unsigned integers (saturating).
1102 return _mm_packus_epi16(m1
, m2
);
1106 FastDivideBy255(__m128i m
)
1109 __m128i v
= _mm_slli_epi32(m
, 8);
1110 // v = v + (m + (255,255,255,255))
1111 v
= _mm_add_epi32(v
, _mm_add_epi32(m
, _mm_set1_epi32(255)));
1113 return _mm_srai_epi32(v
, 16);
1117 FastDivideBy255_16(__m128i m
)
1119 __m128i zero
= _mm_set1_epi16(0);
1120 __m128i lo
= _mm_unpacklo_epi16(m
, zero
);
1121 __m128i hi
= _mm_unpackhi_epi16(m
, zero
);
1122 return _mm_packs_epi32(FastDivideBy255(lo
), FastDivideBy255(hi
));
1126 Pick(__m128i mask
, __m128i a
, __m128i b
)
1128 return _mm_or_si128(_mm_andnot_si128(mask
, a
), _mm_and_si128(mask
, b
));
1131 inline __m128
MixF32(__m128 a
, __m128 b
, float t
)
1133 return _mm_add_ps(a
, _mm_mul_ps(_mm_sub_ps(b
, a
), _mm_set1_ps(t
)));
1136 inline __m128
WSumF32(__m128 a
, __m128 b
, float wa
, float wb
)
1138 return _mm_add_ps(_mm_mul_ps(a
, _mm_set1_ps(wa
)), _mm_mul_ps(b
, _mm_set1_ps(wb
)));
1141 inline __m128
AbsF32(__m128 a
)
1143 return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a
), a
);
1146 inline __m128
AddF32(__m128 a
, __m128 b
)
1148 return _mm_add_ps(a
, b
);
1151 inline __m128
MulF32(__m128 a
, __m128 b
)
1153 return _mm_mul_ps(a
, b
);
1156 inline __m128
DivF32(__m128 a
, __m128 b
)
1158 return _mm_div_ps(a
, b
);
1161 template<uint8_t aIndex
>
1162 inline __m128
SplatF32(__m128 m
)
1164 AssertIndex
<aIndex
>();
1165 return _mm_shuffle_ps(m
, m
, _MM_SHUFFLE(aIndex
, aIndex
, aIndex
, aIndex
));
1168 inline __m128i
F32ToI32(__m128 m
)
1170 return _mm_cvtps_epi32(m
);
1173 #endif // SIMD_COMPILE_SSE2
1178 } // namespace mozilla
1180 #endif // _MOZILLA_GFX_SIMD_H_