/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef _MOZILLA_GFX_SIMD_H_
#define _MOZILLA_GFX_SIMD_H_

/**
 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
 * if they want access to the SSE2 functions.
 */
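
// For example (illustrative sketch only; the include path is hypothetical),
// a translation unit that wants the SSE2 backend would do:
//
//   #define SIMD_COMPILE_SSE2
//   #include "SIMD.h"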

#include <math.h>
#include <stdint.h>

#ifdef SIMD_COMPILE_SSE2
#  include <xmmintrin.h>
#  include <emmintrin.h>
#endif

namespace mozilla {
namespace gfx {

namespace simd {

template <typename u8x16_t>
u8x16_t Load8(const uint8_t* aSource);

template <typename u8x16_t>
u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f,
              uint8_t g, uint8_t h, uint8_t i, uint8_t j, uint8_t k, uint8_t l,
              uint8_t m, uint8_t n, uint8_t o, uint8_t p);

template <typename u8x16_t>
u8x16_t FromZero8();

template <typename i16x8_t>
i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e,
                int16_t f, int16_t g, int16_t h);

template <typename u16x8_t>
u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e,
                uint16_t f, uint16_t g, uint16_t h);

template <typename i16x8_t>
i16x8_t FromI16(int16_t a);

template <typename u16x8_t>
u16x8_t FromU16(uint16_t a);

template <typename i32x4_t>
i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);

template <typename i32x4_t>
i32x4_t From32(int32_t a);

template <typename f32x4_t>
f32x4_t FromF32(float a, float b, float c, float d);

template <typename f32x4_t>
f32x4_t FromF32(float a);

// All SIMD backends overload these functions for their SIMD types:

#if 0

// Store 16 bytes to a 16-byte aligned address
void Store8(uint8_t* aTarget, u8x16_t aM);

// Fixed shifts
template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);

i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);

// Truncating i16 -> i16 multiplication
i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);

// Long multiplication i16 -> i32
// aFactorsA1B1 = (a1[4] b1[4])
// aFactorsA2B2 = (a2[4] b2[4])
// aProductA = a1 * a2, aProductB = b1 * b2
void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
                         i32x4_t& aProductA, i32x4_t& aProductB);

// Long multiplication + pairwise addition i16 -> i32
// See the scalar implementation for specifics.
i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);

// Set all four 32-bit components to the value of the component at aIndex.
template<int8_t aIndex>
i32x4_t Splat32(i32x4_t aM);

// Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
// re-interpret the result as sixteen 8-bit values.
template<int8_t aIndex>
u8x16_t Splat32On8(u8x16_t aM);

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);

u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);

i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);

i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4);

i32x4_t FastDivideBy255(i32x4_t m);
i16x8_t FastDivideBy255_16(i16x8_t m);

#endif
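
// A minimal usage sketch (illustrative only, not part of the interface):
// scale every byte of a 16-byte, 16-byte-aligned buffer by aAlpha / 255.
// The vector types can be the scalar types defined below or __m128i from the
// SSE2 backend; the same source compiles against either. The name ScaleBy and
// its parameters are hypothetical.
#if 0
template <typename u8x16_t, typename u16x8_t>
void ScaleBy(uint8_t* aBuffer, uint8_t aAlpha) {
  u8x16_t pixels = Load8<u8x16_t>(aBuffer);
  u16x8_t alpha = FromU16<u16x8_t>(aAlpha);
  // Widen to 16 bits so the per-byte products (at most 255 * 255) don't wrap.
  u16x8_t lo = FastDivideBy255_16(Mul16(UnpackLo8x8ToU16x8(pixels), alpha));
  u16x8_t hi = FastDivideBy255_16(Mul16(UnpackHi8x8ToU16x8(pixels), alpha));
  // Narrow back to bytes and write the result out.
  Store8(aBuffer, PackAndSaturate16To8(lo, hi));
}
#endif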

// Scalar

struct Scalaru8x16_t {
  uint8_t u8[16];
};

union Scalari16x8_t {
  int16_t i16[8];
  uint16_t u16[8];
};

typedef Scalari16x8_t Scalaru16x8_t;

struct Scalari32x4_t {
  int32_t i32[4];
};

struct Scalarf32x4_t {
  float f32[4];
};

template <>
inline Scalaru8x16_t Load8<Scalaru8x16_t>(const uint8_t* aSource) {
  return *(Scalaru8x16_t*)aSource;
}

inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM) {
  *(Scalaru8x16_t*)aTarget = aM;
}

template <>
inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c,
                                          uint8_t d, uint8_t e, uint8_t f,
                                          uint8_t g, uint8_t h, uint8_t i,
                                          uint8_t j, uint8_t k, uint8_t l,
                                          uint8_t m, uint8_t n, uint8_t o,
                                          uint8_t p) {
  Scalaru8x16_t _m;
  _m.u8[0] = a;
  _m.u8[1] = b;
  _m.u8[2] = c;
  _m.u8[3] = d;
  _m.u8[4] = e;
  _m.u8[5] = f;
  _m.u8[6] = g;
  _m.u8[7] = h;
  _m.u8[8 + 0] = i;
  _m.u8[8 + 1] = j;
  _m.u8[8 + 2] = k;
  _m.u8[8 + 3] = l;
  _m.u8[8 + 4] = m;
  _m.u8[8 + 5] = n;
  _m.u8[8 + 6] = o;
  _m.u8[8 + 7] = p;
  return _m;
}

template <>
inline Scalaru8x16_t FromZero8<Scalaru8x16_t>() {
  return From8<Scalaru8x16_t>(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

template <>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c,
                                            int16_t d, int16_t e, int16_t f,
                                            int16_t g, int16_t h) {
  Scalari16x8_t m;
  m.i16[0] = a;
  m.i16[1] = b;
  m.i16[2] = c;
  m.i16[3] = d;
  m.i16[4] = e;
  m.i16[5] = f;
  m.i16[6] = g;
  m.i16[7] = h;
  return m;
}

template <>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c,
                                            uint16_t d, uint16_t e, uint16_t f,
                                            uint16_t g, uint16_t h) {
  Scalaru16x8_t m;
  m.u16[0] = a;
  m.u16[1] = b;
  m.u16[2] = c;
  m.u16[3] = d;
  m.u16[4] = e;
  m.u16[5] = f;
  m.u16[6] = g;
  m.u16[7] = h;
  return m;
}

template <>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a) {
  return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
}

template <>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a) {
  return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
}

template <>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c,
                                           int32_t d) {
  Scalari32x4_t m;
  m.i32[0] = a;
  m.i32[1] = b;
  m.i32[2] = c;
  m.i32[3] = d;
  return m;
}

template <>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c,
                                            float d) {
  Scalarf32x4_t m;
  m.f32[0] = a;
  m.f32[1] = b;
  m.f32[2] = c;
  m.f32[3] = d;
  return m;
}

template <>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a) {
  return FromF32<Scalarf32x4_t>(a, a, a, a);
}

template <>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a) {
  return From32<Scalari32x4_t>(a, a, a, a);
}

template <int32_t aNumberOfBits>
inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM) {
  return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits,
                                uint16_t(aM.i16[1]) >> aNumberOfBits,
                                uint16_t(aM.i16[2]) >> aNumberOfBits,
                                uint16_t(aM.i16[3]) >> aNumberOfBits,
                                uint16_t(aM.i16[4]) >> aNumberOfBits,
                                uint16_t(aM.i16[5]) >> aNumberOfBits,
                                uint16_t(aM.i16[6]) >> aNumberOfBits,
                                uint16_t(aM.i16[7]) >> aNumberOfBits);
}

template <int32_t aNumberOfBits>
inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM) {
  return From32<Scalari32x4_t>(
      aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
      aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
}

inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1], aM1.u16[2] + aM2.u16[2],
      aM1.u16[3] + aM2.u16[3], aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
      aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
}

inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
                               aM1.i32[2] + aM2.i32[2],
                               aM1.i32[3] + aM2.i32[3]);
}

inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1], aM1.u16[2] - aM2.u16[2],
      aM1.u16[3] - aM2.u16[3], aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
      aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
}

inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
                               aM1.i32[2] - aM2.i32[2],
                               aM1.i32[3] - aM2.i32[3]);
}

// Branch-free min/max of two integers; assumes a - b cannot overflow.
inline int32_t umin(int32_t a, int32_t b) { return a - ((a - b) & -(a > b)); }

inline int32_t umax(int32_t a, int32_t b) { return a - ((a - b) & -(a < b)); }

inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
  return From8<Scalaru8x16_t>(
      umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
      umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
      umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
      umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
      umin(aM1.u8[8 + 0], aM2.u8[8 + 0]), umin(aM1.u8[8 + 1], aM2.u8[8 + 1]),
      umin(aM1.u8[8 + 2], aM2.u8[8 + 2]), umin(aM1.u8[8 + 3], aM2.u8[8 + 3]),
      umin(aM1.u8[8 + 4], aM2.u8[8 + 4]), umin(aM1.u8[8 + 5], aM2.u8[8 + 5]),
      umin(aM1.u8[8 + 6], aM2.u8[8 + 6]), umin(aM1.u8[8 + 7], aM2.u8[8 + 7]));
}

inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
  return From8<Scalaru8x16_t>(
      umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
      umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
      umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
      umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
      umax(aM1.u8[8 + 0], aM2.u8[8 + 0]), umax(aM1.u8[8 + 1], aM2.u8[8 + 1]),
      umax(aM1.u8[8 + 2], aM2.u8[8 + 2]), umax(aM1.u8[8 + 3], aM2.u8[8 + 3]),
      umax(aM1.u8[8 + 4], aM2.u8[8 + 4]), umax(aM1.u8[8 + 5], aM2.u8[8 + 5]),
      umax(aM1.u8[8 + 6], aM2.u8[8 + 6]), umax(aM1.u8[8 + 7], aM2.u8[8 + 7]));
}

inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(
      umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
      umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
}

inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(
      umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
      umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
}

inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])),
      uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
      uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])),
      uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
      uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])),
      uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
      uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])),
      uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
}

inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
                                Scalari16x8_t aFactorsA2B2,
                                Scalari32x4_t& aProductA,
                                Scalari32x4_t& aProductB) {
  aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
                                    aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
                                    aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
                                    aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
  aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
                                    aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
                                    aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
                                    aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
}

inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
                                        Scalari16x8_t aFactorsB) {
  return From32<Scalari32x4_t>(
      aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
      aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
      aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
      aFactorsA.i16[6] * aFactorsB.i16[6] +
          aFactorsA.i16[7] * aFactorsB.i16[7]);
}

template <int8_t aIndex>
inline void AssertIndex() {
  static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
                "Invalid splat index");
}

template <int8_t aIndex>
inline Scalari32x4_t Splat32(Scalari32x4_t aM) {
  AssertIndex<aIndex>();
  return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex], aM.i32[aIndex],
                               aM.i32[aIndex]);
}

template <int8_t i>
inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM) {
  AssertIndex<i>();
  return From8<Scalaru8x16_t>(
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3]);
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari32x4_t Shuffle32(Scalari32x4_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari32x4_t m = aM;
  m.i32[0] = aM.i32[i3];
  m.i32[1] = aM.i32[i2];
  m.i32[2] = aM.i32[i1];
  m.i32[3] = aM.i32[i0];
  return m;
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[0] = aM.i16[i3];
  m.i16[1] = aM.i16[i2];
  m.i16[2] = aM.i16[i1];
  m.i16[3] = aM.i16[i0];
  return m;
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[4 + 0] = aM.i16[4 + i3];
  m.i16[4 + 1] = aM.i16[4 + i2];
  m.i16[4 + 2] = aM.i16[4 + i1];
  m.i16[4 + 3] = aM.i16[4 + i0];
  return m;
}

template <int8_t aIndexLo, int8_t aIndexHi>
inline Scalaru16x8_t Splat16(Scalaru16x8_t aM) {
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  Scalaru16x8_t m;
  int16_t chosenValueLo = aM.u16[aIndexLo];
  m.u16[0] = chosenValueLo;
  m.u16[1] = chosenValueLo;
  m.u16[2] = chosenValueLo;
  m.u16[3] = chosenValueLo;
  int16_t chosenValueHi = aM.u16[4 + aIndexHi];
  m.u16[4] = chosenValueHi;
  m.u16[5] = chosenValueHi;
  m.u16[6] = chosenValueHi;
  m.u16[7] = chosenValueHi;
  return m;
}

inline Scalaru8x16_t InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2) {
  return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1], m1.u8[2],
                              m2.u8[2], m1.u8[3], m2.u8[3], m1.u8[4], m2.u8[4],
                              m1.u8[5], m2.u8[5], m1.u8[6], m2.u8[6], m1.u8[7],
                              m2.u8[7]);
}

inline Scalaru8x16_t InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2) {
  return From8<Scalaru8x16_t>(
      m1.u8[8 + 0], m2.u8[8 + 0], m1.u8[8 + 1], m2.u8[8 + 1], m1.u8[8 + 2],
      m2.u8[8 + 2], m1.u8[8 + 3], m2.u8[8 + 3], m1.u8[8 + 4], m2.u8[8 + 4],
      m1.u8[8 + 5], m2.u8[8 + 5], m1.u8[8 + 6], m2.u8[8 + 6], m1.u8[8 + 7],
      m2.u8[8 + 7]);
}

inline Scalaru16x8_t InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2) {
  return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
                                m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
}

inline Scalaru16x8_t InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2) {
  return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
                                m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
}

inline Scalari32x4_t InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2) {
  return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
}

inline Scalari16x8_t UnpackLo8x8ToI16x8(Scalaru8x16_t aM) {
  Scalari16x8_t m;
  m.i16[0] = aM.u8[0];
  m.i16[1] = aM.u8[1];
  m.i16[2] = aM.u8[2];
  m.i16[3] = aM.u8[3];
  m.i16[4] = aM.u8[4];
  m.i16[5] = aM.u8[5];
  m.i16[6] = aM.u8[6];
  m.i16[7] = aM.u8[7];
  return m;
}

inline Scalari16x8_t UnpackHi8x8ToI16x8(Scalaru8x16_t aM) {
  Scalari16x8_t m;
  m.i16[0] = aM.u8[8 + 0];
  m.i16[1] = aM.u8[8 + 1];
  m.i16[2] = aM.u8[8 + 2];
  m.i16[3] = aM.u8[8 + 3];
  m.i16[4] = aM.u8[8 + 4];
  m.i16[5] = aM.u8[8 + 5];
  m.i16[6] = aM.u8[8 + 6];
  m.i16[7] = aM.u8[8 + 7];
  return m;
}

inline Scalaru16x8_t UnpackLo8x8ToU16x8(Scalaru8x16_t aM) {
  return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]),
                                uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
                                uint16_t(aM.u8[4]), uint16_t(aM.u8[5]),
                                uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
}

inline Scalaru16x8_t UnpackHi8x8ToU16x8(Scalaru8x16_t aM) {
  return FromU16<Scalaru16x8_t>(aM.u8[8 + 0], aM.u8[8 + 1], aM.u8[8 + 2],
                                aM.u8[8 + 3], aM.u8[8 + 4], aM.u8[8 + 5],
                                aM.u8[8 + 6], aM.u8[8 + 7]);
}

// Concatenate a1234 and a5678 (in that order) and return the 16 consecutive
// bytes starting at byte offset aNumBytes of the concatenation.
template <uint8_t aNumBytes>
inline Scalaru8x16_t Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678) {
  Scalaru8x16_t m;
  for (uint8_t i = 0; i < 16; i++) {
    uint8_t sourceByte = i + aNumBytes;
    m.u8[i] =
        sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
  }
  return m;
}

template <typename T>
inline int16_t SaturateTo16(T a) {
  return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
}

inline Scalari16x8_t PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2) {
  Scalari16x8_t m;
  m.i16[0] = SaturateTo16(m1.i32[0]);
  m.i16[1] = SaturateTo16(m1.i32[1]);
  m.i16[2] = SaturateTo16(m1.i32[2]);
  m.i16[3] = SaturateTo16(m1.i32[3]);
  m.i16[4] = SaturateTo16(m2.i32[0]);
  m.i16[5] = SaturateTo16(m2.i32[1]);
  m.i16[6] = SaturateTo16(m2.i32[2]);
  m.i16[7] = SaturateTo16(m2.i32[3]);
  return m;
}

template <typename T>
inline uint16_t SaturateToU16(T a) {
  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
}

inline Scalaru16x8_t PackAndSaturate32ToU16(Scalari32x4_t m1,
                                            Scalari32x4_t m2) {
  Scalaru16x8_t m;
  m.u16[0] = SaturateToU16(m1.i32[0]);
  m.u16[1] = SaturateToU16(m1.i32[1]);
  m.u16[2] = SaturateToU16(m1.i32[2]);
  m.u16[3] = SaturateToU16(m1.i32[3]);
  m.u16[4] = SaturateToU16(m2.i32[0]);
  m.u16[5] = SaturateToU16(m2.i32[1]);
  m.u16[6] = SaturateToU16(m2.i32[2]);
  m.u16[7] = SaturateToU16(m2.i32[3]);
  return m;
}

template <typename T>
inline uint8_t SaturateTo8(T a) {
  return uint8_t(umin(a & -(a >= 0), 255));
}

inline Scalaru8x16_t PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2,
                                          Scalari32x4_t m3,
                                          const Scalari32x4_t& m4) {
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i32[0]);
  m.u8[1] = SaturateTo8(m1.i32[1]);
  m.u8[2] = SaturateTo8(m1.i32[2]);
  m.u8[3] = SaturateTo8(m1.i32[3]);
  m.u8[4] = SaturateTo8(m2.i32[0]);
  m.u8[5] = SaturateTo8(m2.i32[1]);
  m.u8[6] = SaturateTo8(m2.i32[2]);
  m.u8[7] = SaturateTo8(m2.i32[3]);
  m.u8[8] = SaturateTo8(m3.i32[0]);
  m.u8[9] = SaturateTo8(m3.i32[1]);
  m.u8[10] = SaturateTo8(m3.i32[2]);
  m.u8[11] = SaturateTo8(m3.i32[3]);
  m.u8[12] = SaturateTo8(m4.i32[0]);
  m.u8[13] = SaturateTo8(m4.i32[1]);
  m.u8[14] = SaturateTo8(m4.i32[2]);
  m.u8[15] = SaturateTo8(m4.i32[3]);
  return m;
}

inline Scalaru8x16_t PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2) {
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i16[0]);
  m.u8[1] = SaturateTo8(m1.i16[1]);
  m.u8[2] = SaturateTo8(m1.i16[2]);
  m.u8[3] = SaturateTo8(m1.i16[3]);
  m.u8[4] = SaturateTo8(m1.i16[4]);
  m.u8[5] = SaturateTo8(m1.i16[5]);
  m.u8[6] = SaturateTo8(m1.i16[6]);
  m.u8[7] = SaturateTo8(m1.i16[7]);
  m.u8[8] = SaturateTo8(m2.i16[0]);
  m.u8[9] = SaturateTo8(m2.i16[1]);
  m.u8[10] = SaturateTo8(m2.i16[2]);
  m.u8[11] = SaturateTo8(m2.i16[3]);
  m.u8[12] = SaturateTo8(m2.i16[4]);
  m.u8[13] = SaturateTo8(m2.i16[5]);
  m.u8[14] = SaturateTo8(m2.i16[6]);
  m.u8[15] = SaturateTo8(m2.i16[7]);
  return m;
}

// Fast approximate division by 255. It has the property that
// for all 0 <= n <= 255*255, FastDivideBy255(n) == n/255.
// But it only uses two adds and two shifts instead of an
// integer division (which is expensive on many processors).

// equivalent to v/255
template <class B, class A>
inline B FastDivideBy255(A v) {
  return ((v << 8) + v + 255) >> 16;
}
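
// Worked example of the identity above (illustrative only): for v = 510,
// ((510 << 8) + 510 + 255) >> 16 = 131325 >> 16 = 2, which equals 510 / 255.
static_assert(((510 << 8) + 510 + 255) >> 16 == 510 / 255,
              "FastDivideBy255 formula matches exact division for v = 510");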

inline Scalaru16x8_t FastDivideBy255_16(Scalaru16x8_t m) {
  return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
}

inline Scalari32x4_t FastDivideBy255(Scalari32x4_t m) {
  return From32<Scalari32x4_t>(
      FastDivideBy255<int32_t>(m.i32[0]), FastDivideBy255<int32_t>(m.i32[1]),
      FastDivideBy255<int32_t>(m.i32[2]), FastDivideBy255<int32_t>(m.i32[3]));
}

inline Scalaru8x16_t Pick(Scalaru8x16_t mask, Scalaru8x16_t a,
                          Scalaru8x16_t b) {
  return From8<Scalaru8x16_t>(
      (a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
      (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
      (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
      (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
      (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
      (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
      (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
      (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
      (a.u8[8 + 0] & (~mask.u8[8 + 0])) | (b.u8[8 + 0] & mask.u8[8 + 0]),
      (a.u8[8 + 1] & (~mask.u8[8 + 1])) | (b.u8[8 + 1] & mask.u8[8 + 1]),
      (a.u8[8 + 2] & (~mask.u8[8 + 2])) | (b.u8[8 + 2] & mask.u8[8 + 2]),
      (a.u8[8 + 3] & (~mask.u8[8 + 3])) | (b.u8[8 + 3] & mask.u8[8 + 3]),
      (a.u8[8 + 4] & (~mask.u8[8 + 4])) | (b.u8[8 + 4] & mask.u8[8 + 4]),
      (a.u8[8 + 5] & (~mask.u8[8 + 5])) | (b.u8[8 + 5] & mask.u8[8 + 5]),
      (a.u8[8 + 6] & (~mask.u8[8 + 6])) | (b.u8[8 + 6] & mask.u8[8 + 6]),
      (a.u8[8 + 7] & (~mask.u8[8 + 7])) | (b.u8[8 + 7] & mask.u8[8 + 7]));
}

inline Scalari32x4_t Pick(Scalari32x4_t mask, Scalari32x4_t a,
                          Scalari32x4_t b) {
  return From32<Scalari32x4_t>(
      (a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
      (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
      (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
      (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
}

inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t) {
  return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
                                a.f32[1] + (b.f32[1] - a.f32[1]) * t,
                                a.f32[2] + (b.f32[2] - a.f32[2]) * t,
                                a.f32[3] + (b.f32[3] - a.f32[3]) * t);
}

inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa,
                             float wb) {
  return FromF32<Scalarf32x4_t>(
      a.f32[0] * wa + b.f32[0] * wb, a.f32[1] * wa + b.f32[1] * wb,
      a.f32[2] * wa + b.f32[2] * wb, a.f32[3] * wa + b.f32[3] * wb);
}

inline Scalarf32x4_t AbsF32(Scalarf32x4_t a) {
  return FromF32<Scalarf32x4_t>(fabs(a.f32[0]), fabs(a.f32[1]), fabs(a.f32[2]),
                                fabs(a.f32[3]));
}

inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0], a.f32[1] + b.f32[1],
                                a.f32[2] + b.f32[2], a.f32[3] + b.f32[3]);
}

inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0], a.f32[1] * b.f32[1],
                                a.f32[2] * b.f32[2], a.f32[3] * b.f32[3]);
}

inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0], a.f32[1] / b.f32[1],
                                a.f32[2] / b.f32[2], a.f32[3] / b.f32[3]);
}

template <uint8_t aIndex>
inline Scalarf32x4_t SplatF32(Scalarf32x4_t m) {
  AssertIndex<aIndex>();
  return FromF32<Scalarf32x4_t>(m.f32[aIndex], m.f32[aIndex], m.f32[aIndex],
                                m.f32[aIndex]);
}

inline Scalari32x4_t F32ToI32(Scalarf32x4_t m) {
  return From32<Scalari32x4_t>(
      int32_t(floor(m.f32[0] + 0.5f)), int32_t(floor(m.f32[1] + 0.5f)),
      int32_t(floor(m.f32[2] + 0.5f)), int32_t(floor(m.f32[3] + 0.5f)));
}

#ifdef SIMD_COMPILE_SSE2

// SSE2

template <>
inline __m128i Load8<__m128i>(const uint8_t* aSource) {
  return _mm_load_si128((const __m128i*)aSource);
}

inline void Store8(uint8_t* aTarget, __m128i aM) {
  _mm_store_si128((__m128i*)aTarget, aM);
}

template <>
inline __m128i FromZero8<__m128i>() {
  return _mm_setzero_si128();
}

template <>
inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                              uint8_t e, uint8_t f, uint8_t g, uint8_t h,
                              uint8_t i, uint8_t j, uint8_t k, uint8_t l,
                              uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g,
                        (j << 8) + i, (l << 8) + k, (n << 8) + m, (p << 8) + o);
}

template <>
inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d,
                                int16_t e, int16_t f, int16_t g, int16_t h) {
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template <>
inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
                                uint16_t e, uint16_t f, uint16_t g,
                                uint16_t h) {
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template <>
inline __m128i FromI16<__m128i>(int16_t a) {
  return _mm_set1_epi16(a);
}

template <>
inline __m128i FromU16<__m128i>(uint16_t a) {
  return _mm_set1_epi16((int16_t)a);
}

template <>
inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d) {
  return _mm_setr_epi32(a, b, c, d);
}

template <>
inline __m128i From32<__m128i>(int32_t a) {
  return _mm_set1_epi32(a);
}

template <>
inline __m128 FromF32<__m128>(float a, float b, float c, float d) {
  return _mm_setr_ps(a, b, c, d);
}

template <>
inline __m128 FromF32<__m128>(float a) {
  return _mm_set1_ps(a);
}

template <int32_t aNumberOfBits>
inline __m128i ShiftRight16(__m128i aM) {
  return _mm_srli_epi16(aM, aNumberOfBits);
}

template <int32_t aNumberOfBits>
inline __m128i ShiftRight32(__m128i aM) {
  return _mm_srai_epi32(aM, aNumberOfBits);
}

inline __m128i Add16(__m128i aM1, __m128i aM2) {
  return _mm_add_epi16(aM1, aM2);
}

inline __m128i Add32(__m128i aM1, __m128i aM2) {
  return _mm_add_epi32(aM1, aM2);
}

inline __m128i Sub16(__m128i aM1, __m128i aM2) {
  return _mm_sub_epi16(aM1, aM2);
}

inline __m128i Sub32(__m128i aM1, __m128i aM2) {
  return _mm_sub_epi32(aM1, aM2);
}

inline __m128i Min8(__m128i aM1, __m128i aM2) { return _mm_min_epu8(aM1, aM2); }

inline __m128i Max8(__m128i aM1, __m128i aM2) { return _mm_max_epu8(aM1, aM2); }

inline __m128i Min32(__m128i aM1, __m128i aM2) {
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
}

inline __m128i Max32(__m128i aM1, __m128i aM2) {
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
}

inline __m128i Mul16(__m128i aM1, __m128i aM2) {
  return _mm_mullo_epi16(aM1, aM2);
}

inline __m128i MulU16(__m128i aM1, __m128i aM2) {
  return _mm_mullo_epi16(aM1, aM2);
}

inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1, __m128i aFactorsA2B2,
                                __m128i& aProductA, __m128i& aProductB) {
  __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
  __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
  aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
  aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
}

inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA, __m128i aFactorsB) {
  return _mm_madd_epi16(aFactorsA, aFactorsB);
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i Shuffle32(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleLo16(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleHi16(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t aIndex>
inline __m128i Splat32(__m128i aM) {
  return Shuffle32<aIndex, aIndex, aIndex, aIndex>(aM);
}

template <int8_t aIndex>
inline __m128i Splat32On8(__m128i aM) {
  return Shuffle32<aIndex, aIndex, aIndex, aIndex>(aM);
}

template <int8_t aIndexLo, int8_t aIndexHi>
inline __m128i Splat16(__m128i aM) {
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  return ShuffleHi16<aIndexHi, aIndexHi, aIndexHi, aIndexHi>(
      ShuffleLo16<aIndexLo, aIndexLo, aIndexLo, aIndexLo>(aM));
}

inline __m128i UnpackLo8x8ToI16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i UnpackHi8x8ToI16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i UnpackLo8x8ToU16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i UnpackHi8x8ToU16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i InterleaveLo8(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi8(m1, m2);
}

inline __m128i InterleaveHi8(__m128i m1, __m128i m2) {
  return _mm_unpackhi_epi8(m1, m2);
}

inline __m128i InterleaveLo16(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi16(m1, m2);
}

inline __m128i InterleaveHi16(__m128i m1, __m128i m2) {
  return _mm_unpackhi_epi16(m1, m2);
}

inline __m128i InterleaveLo32(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi32(m1, m2);
}

template <uint8_t aNumBytes>
inline __m128i Rotate8(__m128i a1234, __m128i a5678) {
  return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes),
                      _mm_slli_si128(a5678, 16 - aNumBytes));
}

inline __m128i PackAndSaturate32To16(__m128i m1, __m128i m2) {
  return _mm_packs_epi32(m1, m2);
}

inline __m128i PackAndSaturate32ToU16(__m128i m1, __m128i m2) {
  return _mm_packs_epi32(m1, m2);
}

inline __m128i PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3,
                                    const __m128i& m4) {
  // Pack into 8 16bit signed integers (saturating).
  __m128i m12 = _mm_packs_epi32(m1, m2);
  __m128i m34 = _mm_packs_epi32(m3, m4);

  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m12, m34);
}

inline __m128i PackAndSaturate16To8(__m128i m1, __m128i m2) {
  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m1, m2);
}

inline __m128i FastDivideBy255(__m128i m) {
  // v = m << 8
  __m128i v = _mm_slli_epi32(m, 8);
  // v = v + (m + (255,255,255,255))
  v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
  // v = v >> 16
  return _mm_srai_epi32(v, 16);
}

inline __m128i FastDivideBy255_16(__m128i m) {
  __m128i zero = _mm_set1_epi16(0);
  __m128i lo = _mm_unpacklo_epi16(m, zero);
  __m128i hi = _mm_unpackhi_epi16(m, zero);
  return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
}

inline __m128i Pick(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

inline __m128 MixF32(__m128 a, __m128 b, float t) {
  return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
}

inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb) {
  return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)),
                    _mm_mul_ps(b, _mm_set1_ps(wb)));
}

inline __m128 AbsF32(__m128 a) {
  return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
}

inline __m128 AddF32(__m128 a, __m128 b) { return _mm_add_ps(a, b); }

inline __m128 MulF32(__m128 a, __m128 b) { return _mm_mul_ps(a, b); }

inline __m128 DivF32(__m128 a, __m128 b) { return _mm_div_ps(a, b); }

template <uint8_t aIndex>
inline __m128 SplatF32(__m128 m) {
  AssertIndex<aIndex>();
  return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
}

inline __m128i F32ToI32(__m128 m) { return _mm_cvtps_epi32(m); }

#endif  // SIMD_COMPILE_SSE2

}  // namespace simd

}  // namespace gfx
}  // namespace mozilla

#endif  // _MOZILLA_GFX_SIMD_H_