/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef _MOZILLA_GFX_SIMD_H_
#define _MOZILLA_GFX_SIMD_H_

/**
 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
 * if they want access to the SSE2 functions.
 */
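// Illustrative usage (not part of the original header text): a consumer that
// wants the SSE2 overloads might do
//
//   #define SIMD_COMPILE_SSE2
//   #include "SIMD.h"
//
// while a consumer that only needs the portable scalar fallback includes the
// header without the define.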
#include <math.h>
#include <stdint.h>

#ifdef SIMD_COMPILE_SSE2
#include <xmmintrin.h>
#endif

namespace mozilla {
namespace gfx {

namespace simd {

template<typename u8x16_t>
u8x16_t Load8(const uint8_t* aSource);

template<typename u8x16_t>
u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p);

template<typename u8x16_t>
u8x16_t FromZero8();

template<typename i16x8_t>
i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h);

template<typename u16x8_t>
u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h);

template<typename i16x8_t>
i16x8_t FromI16(int16_t a);

template<typename u16x8_t>
u16x8_t FromU16(uint16_t a);

template<typename i32x4_t>
i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);

template<typename i32x4_t>
i32x4_t From32(int32_t a);

template<typename f32x4_t>
f32x4_t FromF32(float a, float b, float c, float d);

template<typename f32x4_t>
f32x4_t FromF32(float a);
// All SIMD backends overload these functions for their SIMD types:

#if 0

// Store 16 bytes to a 16-byte aligned address
void Store8(uint8_t* aTarget, u8x16_t aM);

// Fixed shifts
template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);

i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);

// Truncating i16 -> i16 multiplication
i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);

// Long multiplication i16 -> i32
// aFactorsA1B1 = (a1[4] b1[4])
// aFactorsA2B2 = (a2[4] b2[4])
// aProductA = a1 * a2, aProductB = b1 * b2
void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
                         i32x4_t& aProductA, i32x4_t& aProductB);

// Long multiplication + pairwise addition i16 -> i32
// See the scalar implementation for specifics.
i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);

// Set all four 32-bit components to the value of the component at aIndex.
template<int8_t aIndex>
i32x4_t Splat32(i32x4_t aM);

// Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
// re-interpret the result as sixteen 8-bit values.
template<int8_t aIndex>
u8x16_t Splat32On8(u8x16_t aM);

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);

u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);

i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);

i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4);

i32x4_t FastDivideBy255(i32x4_t m);
i16x8_t FastDivideBy255_16(i16x8_t m);

#endif
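// Illustrative sketch (not part of the original header): how a consumer might
// combine these overloads generically over a backend type.  The function name
// and the "average two rows of sixteen bytes" operation are assumptions made
// up for this example; real callers live in the code that includes this header.
//
//   template<typename u8x16_t, typename u16x8_t>
//   void AverageBytes(uint8_t* aTargetAligned16,
//                     const uint8_t* aSourceA, const uint8_t* aSourceB)
//   {
//     u8x16_t a = Load8<u8x16_t>(aSourceA);
//     u8x16_t b = Load8<u8x16_t>(aSourceB);
//     // Widen to 16 bits so the per-byte sums (at most 510) cannot overflow.
//     u16x8_t lo = ShiftRight16<1>(Add16(UnpackLo8x8ToU16x8(a), UnpackLo8x8ToU16x8(b)));
//     u16x8_t hi = ShiftRight16<1>(Add16(UnpackHi8x8ToU16x8(a), UnpackHi8x8ToU16x8(b)));
//     // Narrow back to sixteen bytes and store the averages.
//     Store8(aTargetAligned16, PackAndSaturate16To8(lo, hi));
//   }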
// Scalar

struct Scalaru8x16_t {
  uint8_t u8[16];
};

union Scalari16x8_t {
  int16_t i16[8];
  uint16_t u16[8];
};

typedef Scalari16x8_t Scalaru16x8_t;

struct Scalari32x4_t {
  int32_t i32[4];
};

struct Scalarf32x4_t {
  float f32[4];
};

template<>
inline Scalaru8x16_t
Load8<Scalaru8x16_t>(const uint8_t* aSource)
{
  return *(Scalaru8x16_t*)aSource;
}

inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM)
{
  *(Scalaru8x16_t*)aTarget = aM;
}

template<>
inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
                                          uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
{
  Scalaru8x16_t _m;
  _m.u8[0] = a;
  _m.u8[1] = b;
  _m.u8[2] = c;
  _m.u8[3] = d;
  _m.u8[4] = e;
  _m.u8[5] = f;
  _m.u8[6] = g;
  _m.u8[7] = h;
  _m.u8[8+0] = i;
  _m.u8[8+1] = j;
  _m.u8[8+2] = k;
  _m.u8[8+3] = l;
  _m.u8[8+4] = m;
  _m.u8[8+5] = n;
  _m.u8[8+6] = o;
  _m.u8[8+7] = p;
  return _m;
}

template<>
inline Scalaru8x16_t FromZero8<Scalaru8x16_t>()
{
  return From8<Scalaru8x16_t>(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
}

template<>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
{
  Scalari16x8_t m;
  m.i16[0] = a;
  m.i16[1] = b;
  m.i16[2] = c;
  m.i16[3] = d;
  m.i16[4] = e;
  m.i16[5] = f;
  m.i16[6] = g;
  m.i16[7] = h;
  return m;
}

template<>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
{
  Scalaru16x8_t m;
  m.u16[0] = a;
  m.u16[1] = b;
  m.u16[2] = c;
  m.u16[3] = d;
  m.u16[4] = e;
  m.u16[5] = f;
  m.u16[6] = g;
  m.u16[7] = h;
  return m;
}

template<>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a)
{
  return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
}

template<>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a)
{
  return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
}

template<>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c, int32_t d)
{
  Scalari32x4_t m;
  m.i32[0] = a;
  m.i32[1] = b;
  m.i32[2] = c;
  m.i32[3] = d;
  return m;
}

template<>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c, float d)
{
  Scalarf32x4_t m;
  m.f32[0] = a;
  m.f32[1] = b;
  m.f32[2] = c;
  m.f32[3] = d;
  return m;
}

template<>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a)
{
  return FromF32<Scalarf32x4_t>(a, a, a, a);
}

template<>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a)
{
  return From32<Scalari32x4_t>(a, a, a, a);
}

template<int32_t aNumberOfBits>
inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM)
{
  return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits, uint16_t(aM.i16[1]) >> aNumberOfBits,
                                uint16_t(aM.i16[2]) >> aNumberOfBits, uint16_t(aM.i16[3]) >> aNumberOfBits,
                                uint16_t(aM.i16[4]) >> aNumberOfBits, uint16_t(aM.i16[5]) >> aNumberOfBits,
                                uint16_t(aM.i16[6]) >> aNumberOfBits, uint16_t(aM.i16[7]) >> aNumberOfBits);
}

template<int32_t aNumberOfBits>
inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM)
{
  return From32<Scalari32x4_t>(aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
                               aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
}

inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
{
  return FromU16<Scalaru16x8_t>(aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1],
                                aM1.u16[2] + aM2.u16[2], aM1.u16[3] + aM2.u16[3],
                                aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
                                aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
}

inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2)
{
  return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
                               aM1.i32[2] + aM2.i32[2], aM1.i32[3] + aM2.i32[3]);
}

inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
{
  return FromU16<Scalaru16x8_t>(aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1],
                                aM1.u16[2] - aM2.u16[2], aM1.u16[3] - aM2.u16[3],
                                aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
                                aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
}

inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2)
{
  return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
                               aM1.i32[2] - aM2.i32[2], aM1.i32[3] - aM2.i32[3]);
}
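// Branchless min/max helpers used by the scalar code below: -(a > b) is all
// ones when a > b and zero otherwise, so umin collapses to b in the first case
// and stays a in the second; umax uses the opposite comparison.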
inline int32_t
umin(int32_t a, int32_t b)
{
  return a - ((a - b) & -(a > b));
}

inline int32_t
umax(int32_t a, int32_t b)
{
  return a - ((a - b) & -(a < b));
}
inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
{
  return From8<Scalaru8x16_t>(umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
                              umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
                              umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
                              umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
                              umin(aM1.u8[8+0], aM2.u8[8+0]), umin(aM1.u8[8+1], aM2.u8[8+1]),
                              umin(aM1.u8[8+2], aM2.u8[8+2]), umin(aM1.u8[8+3], aM2.u8[8+3]),
                              umin(aM1.u8[8+4], aM2.u8[8+4]), umin(aM1.u8[8+5], aM2.u8[8+5]),
                              umin(aM1.u8[8+6], aM2.u8[8+6]), umin(aM1.u8[8+7], aM2.u8[8+7]));
}

inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2)
{
  return From8<Scalaru8x16_t>(umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
                              umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
                              umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
                              umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
                              umax(aM1.u8[8+0], aM2.u8[8+0]), umax(aM1.u8[8+1], aM2.u8[8+1]),
                              umax(aM1.u8[8+2], aM2.u8[8+2]), umax(aM1.u8[8+3], aM2.u8[8+3]),
                              umax(aM1.u8[8+4], aM2.u8[8+4]), umax(aM1.u8[8+5], aM2.u8[8+5]),
                              umax(aM1.u8[8+6], aM2.u8[8+6]), umax(aM1.u8[8+7], aM2.u8[8+7]));
}

inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2)
{
  return From32<Scalari32x4_t>(umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
                               umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
}

inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2)
{
  return From32<Scalari32x4_t>(umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
                               umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
}

inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2)
{
  return FromU16<Scalaru16x8_t>(uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])), uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
                                uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])), uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
                                uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])), uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
                                uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])), uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
}

inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
                                Scalari16x8_t aFactorsA2B2,
                                Scalari32x4_t& aProductA,
                                Scalari32x4_t& aProductB)
{
  aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
                                    aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
                                    aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
                                    aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
  aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
                                    aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
                                    aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
                                    aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
}

inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
                                        Scalari16x8_t aFactorsB)
{
  return From32<Scalari32x4_t>(aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
                               aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
                               aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
                               aFactorsA.i16[6] * aFactorsB.i16[6] + aFactorsA.i16[7] * aFactorsB.i16[7]);
}

template<int8_t aIndex>
inline void AssertIndex()
{
  static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
                "Invalid splat index");
}

template<int8_t aIndex>
inline Scalari32x4_t Splat32(Scalari32x4_t aM)
{
  AssertIndex<aIndex>();
  return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex],
                               aM.i32[aIndex], aM.i32[aIndex]);
}

template<int8_t i>
inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM)
{
  AssertIndex<i>();
  return From8<Scalaru8x16_t>(aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3],
                              aM.u8[i*4], aM.u8[i*4+1], aM.u8[i*4+2], aM.u8[i*4+3]);
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari32x4_t Shuffle32(Scalari32x4_t aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari32x4_t m = aM;
  m.i32[0] = aM.i32[i3];
  m.i32[1] = aM.i32[i2];
  m.i32[2] = aM.i32[i1];
  m.i32[3] = aM.i32[i0];
  return m;
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[0] = aM.i16[i3];
  m.i16[1] = aM.i16[i2];
  m.i16[2] = aM.i16[i1];
  m.i16[3] = aM.i16[i0];
  return m;
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[4 + 0] = aM.i16[4 + i3];
  m.i16[4 + 1] = aM.i16[4 + i2];
  m.i16[4 + 2] = aM.i16[4 + i1];
  m.i16[4 + 3] = aM.i16[4 + i0];
  return m;
}
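// Broadcast lane aIndexLo across the four low 16-bit lanes and lane
// 4 + aIndexHi across the four high 16-bit lanes.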
template<int8_t aIndexLo, int8_t aIndexHi>
inline Scalaru16x8_t Splat16(Scalaru16x8_t aM)
{
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  Scalaru16x8_t m;
  int16_t chosenValueLo = aM.u16[aIndexLo];
  m.u16[0] = chosenValueLo;
  m.u16[1] = chosenValueLo;
  m.u16[2] = chosenValueLo;
  m.u16[3] = chosenValueLo;
  int16_t chosenValueHi = aM.u16[4 + aIndexHi];
  m.u16[4] = chosenValueHi;
  m.u16[5] = chosenValueHi;
  m.u16[6] = chosenValueHi;
  m.u16[7] = chosenValueHi;
  return m;
}

inline Scalaru8x16_t
InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2)
{
  return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1],
                              m1.u8[2], m2.u8[2], m1.u8[3], m2.u8[3],
                              m1.u8[4], m2.u8[4], m1.u8[5], m2.u8[5],
                              m1.u8[6], m2.u8[6], m1.u8[7], m2.u8[7]);
}

inline Scalaru8x16_t
InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2)
{
  return From8<Scalaru8x16_t>(m1.u8[8+0], m2.u8[8+0], m1.u8[8+1], m2.u8[8+1],
                              m1.u8[8+2], m2.u8[8+2], m1.u8[8+3], m2.u8[8+3],
                              m1.u8[8+4], m2.u8[8+4], m1.u8[8+5], m2.u8[8+5],
                              m1.u8[8+6], m2.u8[8+6], m1.u8[8+7], m2.u8[8+7]);
}

inline Scalaru16x8_t
InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2)
{
  return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
                                m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
}

inline Scalaru16x8_t
InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2)
{
  return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
                                m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
}

inline Scalari32x4_t
InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2)
{
  return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
}

inline Scalari16x8_t
UnpackLo8x8ToI16x8(Scalaru8x16_t aM)
{
  Scalari16x8_t m;
  m.i16[0] = aM.u8[0];
  m.i16[1] = aM.u8[1];
  m.i16[2] = aM.u8[2];
  m.i16[3] = aM.u8[3];
  m.i16[4] = aM.u8[4];
  m.i16[5] = aM.u8[5];
  m.i16[6] = aM.u8[6];
  m.i16[7] = aM.u8[7];
  return m;
}

inline Scalari16x8_t
UnpackHi8x8ToI16x8(Scalaru8x16_t aM)
{
  Scalari16x8_t m;
  m.i16[0] = aM.u8[8+0];
  m.i16[1] = aM.u8[8+1];
  m.i16[2] = aM.u8[8+2];
  m.i16[3] = aM.u8[8+3];
  m.i16[4] = aM.u8[8+4];
  m.i16[5] = aM.u8[8+5];
  m.i16[6] = aM.u8[8+6];
  m.i16[7] = aM.u8[8+7];
  return m;
}

inline Scalaru16x8_t
UnpackLo8x8ToU16x8(Scalaru8x16_t aM)
{
  return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]), uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
                                uint16_t(aM.u8[4]), uint16_t(aM.u8[5]), uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
}

inline Scalaru16x8_t
UnpackHi8x8ToU16x8(Scalaru8x16_t aM)
{
  return FromU16<Scalaru16x8_t>(aM.u8[8+0], aM.u8[8+1], aM.u8[8+2], aM.u8[8+3],
                                aM.u8[8+4], aM.u8[8+5], aM.u8[8+6], aM.u8[8+7]);
}
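// Concatenate a1234 and a5678 (32 bytes in total) and return the 16 bytes
// starting at byte offset aNumBytes.  For example, Rotate8<4>(a, b) yields
// bytes a[4..15] followed by b[0..3].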
template<uint8_t aNumBytes>
inline Scalaru8x16_t
Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678)
{
  Scalaru8x16_t m;
  for (uint8_t i = 0; i < 16; i++) {
    uint8_t sourceByte = i + aNumBytes;
    m.u8[i] = sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
  }
  return m;
}

template<typename T>
inline int16_t
SaturateTo16(T a)
{
  return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
}

inline Scalari16x8_t
PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2)
{
  Scalari16x8_t m;
  m.i16[0] = SaturateTo16(m1.i32[0]);
  m.i16[1] = SaturateTo16(m1.i32[1]);
  m.i16[2] = SaturateTo16(m1.i32[2]);
  m.i16[3] = SaturateTo16(m1.i32[3]);
  m.i16[4] = SaturateTo16(m2.i32[0]);
  m.i16[5] = SaturateTo16(m2.i32[1]);
  m.i16[6] = SaturateTo16(m2.i32[2]);
  m.i16[7] = SaturateTo16(m2.i32[3]);
  return m;
}
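// Clamp a to the range [0, INT16_MAX]: "a & -(a >= 0)" zeroes out negative
// inputs, and umin then caps the result at INT16_MAX.  SaturateTo8 below uses
// the same trick with an upper bound of 255.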
template<typename T>
inline uint16_t
SaturateToU16(T a)
{
  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
}

inline Scalaru16x8_t
PackAndSaturate32ToU16(Scalari32x4_t m1, Scalari32x4_t m2)
{
  Scalaru16x8_t m;
  m.u16[0] = SaturateToU16(m1.i32[0]);
  m.u16[1] = SaturateToU16(m1.i32[1]);
  m.u16[2] = SaturateToU16(m1.i32[2]);
  m.u16[3] = SaturateToU16(m1.i32[3]);
  m.u16[4] = SaturateToU16(m2.i32[0]);
  m.u16[5] = SaturateToU16(m2.i32[1]);
  m.u16[6] = SaturateToU16(m2.i32[2]);
  m.u16[7] = SaturateToU16(m2.i32[3]);
  return m;
}

template<typename T>
inline uint8_t
SaturateTo8(T a)
{
  return uint8_t(umin(a & -(a >= 0), 255));
}

inline Scalaru8x16_t
PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2, Scalari32x4_t m3, const Scalari32x4_t& m4)
{
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i32[0]);
  m.u8[1] = SaturateTo8(m1.i32[1]);
  m.u8[2] = SaturateTo8(m1.i32[2]);
  m.u8[3] = SaturateTo8(m1.i32[3]);
  m.u8[4] = SaturateTo8(m2.i32[0]);
  m.u8[5] = SaturateTo8(m2.i32[1]);
  m.u8[6] = SaturateTo8(m2.i32[2]);
  m.u8[7] = SaturateTo8(m2.i32[3]);
  m.u8[8] = SaturateTo8(m3.i32[0]);
  m.u8[9] = SaturateTo8(m3.i32[1]);
  m.u8[10] = SaturateTo8(m3.i32[2]);
  m.u8[11] = SaturateTo8(m3.i32[3]);
  m.u8[12] = SaturateTo8(m4.i32[0]);
  m.u8[13] = SaturateTo8(m4.i32[1]);
  m.u8[14] = SaturateTo8(m4.i32[2]);
  m.u8[15] = SaturateTo8(m4.i32[3]);
  return m;
}

inline Scalaru8x16_t
PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2)
{
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i16[0]);
  m.u8[1] = SaturateTo8(m1.i16[1]);
  m.u8[2] = SaturateTo8(m1.i16[2]);
  m.u8[3] = SaturateTo8(m1.i16[3]);
  m.u8[4] = SaturateTo8(m1.i16[4]);
  m.u8[5] = SaturateTo8(m1.i16[5]);
  m.u8[6] = SaturateTo8(m1.i16[6]);
  m.u8[7] = SaturateTo8(m1.i16[7]);
  m.u8[8] = SaturateTo8(m2.i16[0]);
  m.u8[9] = SaturateTo8(m2.i16[1]);
  m.u8[10] = SaturateTo8(m2.i16[2]);
  m.u8[11] = SaturateTo8(m2.i16[3]);
  m.u8[12] = SaturateTo8(m2.i16[4]);
  m.u8[13] = SaturateTo8(m2.i16[5]);
  m.u8[14] = SaturateTo8(m2.i16[6]);
  m.u8[15] = SaturateTo8(m2.i16[7]);
  return m;
}

// Fast approximate division by 255. It has the property that
// for all 0 <= n <= 255*255, FastDivideBy255(n) == n/255.
// But it only uses two adds and two shifts instead of an
// integer division (which is expensive on many processors).
//
// equivalent to v/255
template<class B, class A>
inline B FastDivideBy255(A v)
{
  return ((v << 8) + v + 255) >> 16;
}
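// Worked example: FastDivideBy255(510) = ((510 << 8) + 510 + 255) >> 16
//                                      = 131325 >> 16 = 2, which matches 510 / 255.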
inline Scalaru16x8_t
FastDivideBy255_16(Scalaru16x8_t m)
{
  return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
}

inline Scalari32x4_t
FastDivideBy255(Scalari32x4_t m)
{
  return From32<Scalari32x4_t>(FastDivideBy255<int32_t>(m.i32[0]),
                               FastDivideBy255<int32_t>(m.i32[1]),
                               FastDivideBy255<int32_t>(m.i32[2]),
                               FastDivideBy255<int32_t>(m.i32[3]));
}
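// Bitwise select: for each bit, take the bit from b where the mask bit is set
// and the bit from a where it is clear (typically the mask lanes are all-ones
// or all-zero comparison results).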
inline Scalaru8x16_t
Pick(Scalaru8x16_t mask, Scalaru8x16_t a, Scalaru8x16_t b)
{
  return From8<Scalaru8x16_t>((a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
                              (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
                              (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
                              (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
                              (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
                              (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
                              (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
                              (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
                              (a.u8[8+0] & (~mask.u8[8+0])) | (b.u8[8+0] & mask.u8[8+0]),
                              (a.u8[8+1] & (~mask.u8[8+1])) | (b.u8[8+1] & mask.u8[8+1]),
                              (a.u8[8+2] & (~mask.u8[8+2])) | (b.u8[8+2] & mask.u8[8+2]),
                              (a.u8[8+3] & (~mask.u8[8+3])) | (b.u8[8+3] & mask.u8[8+3]),
                              (a.u8[8+4] & (~mask.u8[8+4])) | (b.u8[8+4] & mask.u8[8+4]),
                              (a.u8[8+5] & (~mask.u8[8+5])) | (b.u8[8+5] & mask.u8[8+5]),
                              (a.u8[8+6] & (~mask.u8[8+6])) | (b.u8[8+6] & mask.u8[8+6]),
                              (a.u8[8+7] & (~mask.u8[8+7])) | (b.u8[8+7] & mask.u8[8+7]));
}

inline Scalari32x4_t
Pick(Scalari32x4_t mask, Scalari32x4_t a, Scalari32x4_t b)
{
  return From32<Scalari32x4_t>((a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
                               (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
                               (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
                               (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
}
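// MixF32 linearly interpolates between a and b by t; WSumF32 computes the
// weighted sum wa * a + wb * b.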
inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
                                a.f32[1] + (b.f32[1] - a.f32[1]) * t,
                                a.f32[2] + (b.f32[2] - a.f32[2]) * t,
                                a.f32[3] + (b.f32[3] - a.f32[3]) * t);
}

inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa, float wb)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] * wa + b.f32[0] * wb,
                                a.f32[1] * wa + b.f32[1] * wb,
                                a.f32[2] * wa + b.f32[2] * wb,
                                a.f32[3] * wa + b.f32[3] * wb);
}

inline Scalarf32x4_t AbsF32(Scalarf32x4_t a)
{
  return FromF32<Scalarf32x4_t>(fabs(a.f32[0]),
                                fabs(a.f32[1]),
                                fabs(a.f32[2]),
                                fabs(a.f32[3]));
}

inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0],
                                a.f32[1] + b.f32[1],
                                a.f32[2] + b.f32[2],
                                a.f32[3] + b.f32[3]);
}

inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0],
                                a.f32[1] * b.f32[1],
                                a.f32[2] * b.f32[2],
                                a.f32[3] * b.f32[3]);
}

inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b)
{
  return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0],
                                a.f32[1] / b.f32[1],
                                a.f32[2] / b.f32[2],
                                a.f32[3] / b.f32[3]);
}

template<uint8_t aIndex>
inline Scalarf32x4_t SplatF32(Scalarf32x4_t m)
{
  AssertIndex<aIndex>();
  return FromF32<Scalarf32x4_t>(m.f32[aIndex],
                                m.f32[aIndex],
                                m.f32[aIndex],
                                m.f32[aIndex]);
}

inline Scalari32x4_t F32ToI32(Scalarf32x4_t m)
{
  return From32<Scalari32x4_t>(int32_t(floor(m.f32[0] + 0.5f)),
                               int32_t(floor(m.f32[1] + 0.5f)),
                               int32_t(floor(m.f32[2] + 0.5f)),
                               int32_t(floor(m.f32[3] + 0.5f)));
}
#ifdef SIMD_COMPILE_SSE2

// SSE2

template<>
inline __m128i
Load8<__m128i>(const uint8_t* aSource)
{
  return _mm_load_si128((const __m128i*)aSource);
}

inline void Store8(uint8_t* aTarget, __m128i aM)
{
  _mm_store_si128((__m128i*)aTarget, aM);
}

template<>
inline __m128i FromZero8<__m128i>()
{
  return _mm_setzero_si128();
}
template<>
inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
                              uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p)
{
  // Each 16-bit lane holds two consecutive bytes, with the first byte of the
  // pair in the low 8 bits (little endian).
  return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g,
                        (j << 8) + i, (l << 8) + k, (n << 8) + m, (p << 8) + o);
}
template<>
inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e, int16_t f, int16_t g, int16_t h)
{
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template<>
inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h)
{
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template<>
inline __m128i FromI16<__m128i>(int16_t a)
{
  return _mm_set1_epi16(a);
}

template<>
inline __m128i FromU16<__m128i>(uint16_t a)
{
  return _mm_set1_epi16((int16_t)a);
}

template<>
inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d)
{
  return _mm_setr_epi32(a, b, c, d);
}

template<>
inline __m128i From32<__m128i>(int32_t a)
{
  return _mm_set1_epi32(a);
}

template<>
inline __m128 FromF32<__m128>(float a, float b, float c, float d)
{
  return _mm_setr_ps(a, b, c, d);
}

template<>
inline __m128 FromF32<__m128>(float a)
{
  return _mm_set1_ps(a);
}

template<int32_t aNumberOfBits>
inline __m128i ShiftRight16(__m128i aM)
{
  return _mm_srli_epi16(aM, aNumberOfBits);
}

template<int32_t aNumberOfBits>
inline __m128i ShiftRight32(__m128i aM)
{
  return _mm_srai_epi32(aM, aNumberOfBits);
}

inline __m128i Add16(__m128i aM1, __m128i aM2)
{
  return _mm_add_epi16(aM1, aM2);
}

inline __m128i Add32(__m128i aM1, __m128i aM2)
{
  return _mm_add_epi32(aM1, aM2);
}

inline __m128i Sub16(__m128i aM1, __m128i aM2)
{
  return _mm_sub_epi16(aM1, aM2);
}

inline __m128i Sub32(__m128i aM1, __m128i aM2)
{
  return _mm_sub_epi32(aM1, aM2);
}

inline __m128i Min8(__m128i aM1, __m128i aM2)
{
  return _mm_min_epu8(aM1, aM2);
}

inline __m128i Max8(__m128i aM1, __m128i aM2)
{
  return _mm_max_epu8(aM1, aM2);
}
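// SSE2 has no packed 32-bit min/max instruction (_mm_min_epi32 and
// _mm_max_epi32 only arrived with SSE4.1), so emulate them with a compare
// and a masked subtract.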
inline __m128i Min32(__m128i aM1, __m128i aM2)
{
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
}

inline __m128i Max32(__m128i aM1, __m128i aM2)
{
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
}

inline __m128i Mul16(__m128i aM1, __m128i aM2)
{
  return _mm_mullo_epi16(aM1, aM2);
}

inline __m128i MulU16(__m128i aM1, __m128i aM2)
{
  return _mm_mullo_epi16(aM1, aM2);
}

inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1,
                                __m128i aFactorsA2B2,
                                __m128i& aProductA,
                                __m128i& aProductB)
{
  __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
  __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
  aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
  aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
}

inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA,
                                  __m128i aFactorsB)
{
  return _mm_madd_epi16(aFactorsA, aFactorsB);
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i Shuffle32(__m128i aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleLo16(__m128i aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleHi16(__m128i aM)
{
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template<int8_t aIndex>
inline __m128i Splat32(__m128i aM)
{
  return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
}

template<int8_t aIndex>
inline __m128i Splat32On8(__m128i aM)
{
  return Shuffle32<aIndex,aIndex,aIndex,aIndex>(aM);
}

template<int8_t aIndexLo, int8_t aIndexHi>
inline __m128i Splat16(__m128i aM)
{
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  return ShuffleHi16<aIndexHi,aIndexHi,aIndexHi,aIndexHi>(
    ShuffleLo16<aIndexLo,aIndexLo,aIndexLo,aIndexLo>(aM));
}

inline __m128i
UnpackLo8x8ToI16x8(__m128i m)
{
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i
UnpackHi8x8ToI16x8(__m128i m)
{
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i
UnpackLo8x8ToU16x8(__m128i m)
{
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i
UnpackHi8x8ToU16x8(__m128i m)
{
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i
InterleaveLo8(__m128i m1, __m128i m2)
{
  return _mm_unpacklo_epi8(m1, m2);
}

inline __m128i
InterleaveHi8(__m128i m1, __m128i m2)
{
  return _mm_unpackhi_epi8(m1, m2);
}

inline __m128i
InterleaveLo16(__m128i m1, __m128i m2)
{
  return _mm_unpacklo_epi16(m1, m2);
}

inline __m128i
InterleaveHi16(__m128i m1, __m128i m2)
{
  return _mm_unpackhi_epi16(m1, m2);
}

inline __m128i
InterleaveLo32(__m128i m1, __m128i m2)
{
  return _mm_unpacklo_epi32(m1, m2);
}

template<uint8_t aNumBytes>
inline __m128i
Rotate8(__m128i a1234, __m128i a5678)
{
  return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes), _mm_slli_si128(a5678, 16 - aNumBytes));
}

inline __m128i
PackAndSaturate32To16(__m128i m1, __m128i m2)
{
  return _mm_packs_epi32(m1, m2);
}

inline __m128i
PackAndSaturate32ToU16(__m128i m1, __m128i m2)
{
  return _mm_packs_epi32(m1, m2);
}

inline __m128i
PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3, const __m128i& m4)
{
  // Pack into 8 16bit signed integers (saturating).
  __m128i m12 = _mm_packs_epi32(m1, m2);
  __m128i m34 = _mm_packs_epi32(m3, m4);

  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m12, m34);
}

inline __m128i
PackAndSaturate16To8(__m128i m1, __m128i m2)
{
  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m1, m2);
}

inline __m128i
FastDivideBy255(__m128i m)
{
  // v = m << 8
  __m128i v = _mm_slli_epi32(m, 8);
  // v = v + (m + (255,255,255,255))
  v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
  // v = v >> 16
  return _mm_srai_epi32(v, 16);
}

inline __m128i
FastDivideBy255_16(__m128i m)
{
  __m128i zero = _mm_set1_epi16(0);
  __m128i lo = _mm_unpacklo_epi16(m, zero);
  __m128i hi = _mm_unpackhi_epi16(m, zero);
  return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
}

inline __m128i
Pick(__m128i mask, __m128i a, __m128i b)
{
  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

inline __m128 MixF32(__m128 a, __m128 b, float t)
{
  return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
}

inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb)
{
  return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)), _mm_mul_ps(b, _mm_set1_ps(wb)));
}

inline __m128 AbsF32(__m128 a)
{
  return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
}

inline __m128 AddF32(__m128 a, __m128 b)
{
  return _mm_add_ps(a, b);
}

inline __m128 MulF32(__m128 a, __m128 b)
{
  return _mm_mul_ps(a, b);
}

inline __m128 DivF32(__m128 a, __m128 b)
{
  return _mm_div_ps(a, b);
}

template<uint8_t aIndex>
inline __m128 SplatF32(__m128 m)
{
  AssertIndex<aIndex>();
  return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
}

inline __m128i F32ToI32(__m128 m)
{
  return _mm_cvtps_epi32(m);
}

#endif // SIMD_COMPILE_SSE2

} // namespace simd

} // namespace gfx
} // namespace mozilla

#endif // _MOZILLA_GFX_SIMD_H_