/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or in the case of a and b: (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise-or of the
 * carries of the individual numbers, since the sum of 3 bits can only ever
 * have a carry of one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1, or,
 * assuming we eliminate overflows and underflows, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4-input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 * avg = ((sum >> 1) + carry + (d >> 1)) >> 1
 * avg = ((a + b + c) >> 1 + (d >> 1)) >> 1
 * avg = (a + b + c + d) >> 2
 *
 * An additional fact used in the SSE versions is that we can trivially
 * convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
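
/* A quick worked example of the derivation above (illustration only, using
 * small single-byte values): with a = 1, b = 2, c = 3, d = 6 we get
 *   sum   = a ^ b ^ c                   = 0
 *   carry = (a & b) | (a & c) | (b & c) = 3
 *   sum2  = (sum + d) >> 1              = 3
 *   avg   = (sum2 + carry) >> 1         = 3 == (1 + 2 + 3 + 6) >> 2
 * And for the rounded-to-truncated trick, with 8-bit a = 2, b = 3:
 *   f(~a, ~b) = (253 + 252 + 1) >> 1 = 253, and ~253 = 2 == (2 + 3) >> 1.
 */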

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg) {
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}

/* We have to pass pointers here, MSVC does not allow passing more than 3
 * __m128i arguments on the stack. And it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010. It does -not- inline
 * with just the inline directive.
 */
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i* a, __m128i* b, __m128i* c,
                                       __m128i* d) {
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile time constant.
#define shuffle_si128(arga, argb, imm)                      \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), \
                                  _mm_castsi128_ps((argb)), (imm)));
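
  // With _mm_shuffle_ps semantics, shuf1 gathers the even-indexed 32-bit
  // pixels of its two inputs and shuf2 the odd-indexed ones, so the shuffles
  // below split each pair of horizontally adjacent pixels across two
  // registers before the byte-wise averages combine them with the two rows.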
  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry =
      _mm_or_si128(_mm_and_si128(*a, *b),
                   _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}

// Truncated average of two registers of pixels (used for vertically adjacent
// rows).
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b) {
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

// Truncated average of horizontally adjacent pixel pairs: 8 pixels in, 4 out.
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b) {
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b),
                                      _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}

MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d) {
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Not having a byte based average instruction means we should mask to avoid
  // underflow; see the note after this function.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}
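
/* For reference, the per-byte average above relies on the identity
 * x + y == (x ^ y) + 2 * (x & y), so (x + y) >> 1 == ((x ^ y) >> 1) + (x & y).
 * Masking with 0xfefefefe clears the low bit of every byte before the shift,
 * so no bit can leak from one colour channel into the one below it. E.g. for
 * the single bytes 0x03 and 0x05:
 *   ((0x03 ^ 0x05) >> 1) + (0x03 & 0x05) = (0x06 >> 1) + 0x01 = 0x04
 * which equals (3 + 5) / 2.
 */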

// Simple 2 pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b) {
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}

namespace mozilla::gfx {

void ImageHalfScaler::HalfImage2D_SSE2(uint8_t* aSource, int32_t aSourceStride,
                                       const IntSize& aSourceSize,
                                       uint8_t* aDest, uint32_t aDestStride) {
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);

    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
    //
    // Potentially we only have to do this in the last row since overflowing
    // 8 pixels in an earlier row would appear to be harmless as it doesn't
    // touch invalid memory, even when reading and writing to the same surface.
    // In practice we only do this when doing an additional downscale pass, and
    // in this situation we have unused stride to write into harmlessly.
    // I do not believe the additional code complexity would be worth it though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t* upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ =
          Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                 *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}

void ImageHalfScaler::HalfImageVertical_SSE2(uint8_t* aSource,
                                             int32_t aSourceStride,
                                             const IntSize& aSourceSize,
                                             uint8_t* aDest,
                                             uint32_t aDestStride) {
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);

    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // This line doesn't align well.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x++) {
      uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}

void ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t* aSource,
                                               int32_t aSourceStride,
                                               const IntSize& aSourceSize,
                                               uint8_t* aDest,
                                               uint32_t aDestStride) {
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i* storage = (__m128i*)(aDest + (y * aDestStride));

    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t* pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

}  // namespace mozilla::gfx