/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
// Copyright (c) 2011-2016 Google Inc.
// Use of this source code is governed by a BSD-style license that can be
// found in the gfx/skia/LICENSE file.
7 #include "SkConvolver.h"
8 #include "mozilla/Attributes.h"
13 static MOZ_ALWAYS_INLINE
void AccumRemainder(
14 const unsigned char* pixelsLeft
,
15 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
, __m128i
& accum
,
17 int remainder
[4] = {0};
18 for (int i
= 0; i
< r
; i
++) {
19 SkConvolutionFilter1D::ConvolutionFixed coeff
= filterValues
[i
];
20 remainder
[0] += coeff
* pixelsLeft
[i
* 4 + 0];
21 remainder
[1] += coeff
* pixelsLeft
[i
* 4 + 1];
22 remainder
[2] += coeff
* pixelsLeft
[i
* 4 + 2];
23 remainder
[3] += coeff
* pixelsLeft
[i
* 4 + 3];
26 _mm_setr_epi32(remainder
[0], remainder
[1], remainder
[2], remainder
[3]);
27 accum
= _mm_add_epi32(accum
, t
);
30 // Convolves horizontally along a single row. The row data is given in
31 // |srcData| and continues for the numValues() of the filter.
32 void convolve_horizontally_sse2(const unsigned char* srcData
,
33 const SkConvolutionFilter1D
& filter
,
34 unsigned char* outRow
, bool /*hasAlpha*/) {
35 // Output one pixel each iteration, calculating all channels (RGBA) together.
36 int numValues
= filter
.numValues();
37 for (int outX
= 0; outX
< numValues
; outX
++) {
38 // Get the filter that determines the current output pixel.
39 int filterOffset
, filterLength
;
40 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
=
41 filter
.FilterForValue(outX
, &filterOffset
, &filterLength
);
43 // Compute the first pixel in this row that the filter affects. It will
44 // touch |filterLength| pixels (4 bytes each) after this.
45 const unsigned char* rowToFilter
= &srcData
[filterOffset
* 4];
47 __m128i zero
= _mm_setzero_si128();
48 __m128i accum
= _mm_setzero_si128();
50 // We will load and accumulate with four coefficients per iteration.
51 for (int filterX
= 0; filterX
< filterLength
>> 2; filterX
++) {
52 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
53 __m128i coeff
, coeff16
;
54 // [16] xx xx xx xx c3 c2 c1 c0
55 coeff
= _mm_loadl_epi64(reinterpret_cast<const __m128i
*>(filterValues
));
56 // [16] xx xx xx xx c1 c1 c0 c0
57 coeff16
= _mm_shufflelo_epi16(coeff
, _MM_SHUFFLE(1, 1, 0, 0));
58 // [16] c1 c1 c1 c1 c0 c0 c0 c0
59 coeff16
= _mm_unpacklo_epi16(coeff16
, coeff16
);
61 // Load four pixels => unpack the first two pixels to 16 bits =>
62 // multiply with coefficients => accumulate the convolution result.
63 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
65 _mm_loadu_si128(reinterpret_cast<const __m128i
*>(rowToFilter
));
66 // [16] a1 b1 g1 r1 a0 b0 g0 r0
67 __m128i src16
= _mm_unpacklo_epi8(src8
, zero
);
68 __m128i mul_hi
= _mm_mulhi_epi16(src16
, coeff16
);
69 __m128i mul_lo
= _mm_mullo_epi16(src16
, coeff16
);
70 // [32] a0*c0 b0*c0 g0*c0 r0*c0
71 __m128i t
= _mm_unpacklo_epi16(mul_lo
, mul_hi
);
72 accum
= _mm_add_epi32(accum
, t
);
73 // [32] a1*c1 b1*c1 g1*c1 r1*c1
74 t
= _mm_unpackhi_epi16(mul_lo
, mul_hi
);
75 accum
= _mm_add_epi32(accum
, t
);
77 // Duplicate 3rd and 4th coefficients for all channels =>
78 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
79 // => accumulate the convolution results.
80 // [16] xx xx xx xx c3 c3 c2 c2
81 coeff16
= _mm_shufflelo_epi16(coeff
, _MM_SHUFFLE(3, 3, 2, 2));
82 // [16] c3 c3 c3 c3 c2 c2 c2 c2
83 coeff16
= _mm_unpacklo_epi16(coeff16
, coeff16
);
84 // [16] a3 g3 b3 r3 a2 g2 b2 r2
85 src16
= _mm_unpackhi_epi8(src8
, zero
);
86 mul_hi
= _mm_mulhi_epi16(src16
, coeff16
);
87 mul_lo
= _mm_mullo_epi16(src16
, coeff16
);
88 // [32] a2*c2 b2*c2 g2*c2 r2*c2
89 t
= _mm_unpacklo_epi16(mul_lo
, mul_hi
);
90 accum
= _mm_add_epi32(accum
, t
);
91 // [32] a3*c3 b3*c3 g3*c3 r3*c3
92 t
= _mm_unpackhi_epi16(mul_lo
, mul_hi
);
93 accum
= _mm_add_epi32(accum
, t
);
95 // Advance the pixel and coefficients pointers.
100 // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3
101 // coefficients one at a time.
102 int r
= filterLength
& 3;
104 int remainderOffset
= (filterOffset
+ filterLength
- r
) * 4;
105 AccumRemainder(srcData
+ remainderOffset
, filterValues
, accum
, r
);
108 // Shift right for fixed point implementation.
109 accum
= _mm_srai_epi32(accum
, SkConvolutionFilter1D::kShiftBits
);
111 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
112 accum
= _mm_packs_epi32(accum
, zero
);
113 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
114 accum
= _mm_packus_epi16(accum
, zero
);
116 // Store the pixel value of 32 bits.
117 *(reinterpret_cast<int*>(outRow
)) = _mm_cvtsi128_si32(accum
);
122 // Does vertical convolution to produce one output row. The filter values and
123 // length are given in the first two parameters. These are applied to each
124 // of the rows pointed to in the |sourceDataRows| array, with each row
125 // being |pixelWidth| wide.
127 // The output must have room for |pixelWidth * 4| bytes.
128 template <bool hasAlpha
>
129 static void ConvolveVertically(
130 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
,
131 int filterLength
, unsigned char* const* sourceDataRows
, int pixelWidth
,
132 unsigned char* outRow
) {
133 // Output four pixels per iteration (16 bytes).
134 int width
= pixelWidth
& ~3;
135 __m128i zero
= _mm_setzero_si128();
136 for (int outX
= 0; outX
< width
; outX
+= 4) {
137 // Accumulated result for each pixel. 32 bits per RGBA channel.
138 __m128i accum0
= _mm_setzero_si128();
139 __m128i accum1
= _mm_setzero_si128();
140 __m128i accum2
= _mm_setzero_si128();
141 __m128i accum3
= _mm_setzero_si128();
143 // Convolve with one filter coefficient per iteration.
144 for (int filterY
= 0; filterY
< filterLength
; filterY
++) {
145 // Duplicate the filter coefficient 8 times.
146 // [16] cj cj cj cj cj cj cj cj
147 __m128i coeff16
= _mm_set1_epi16(filterValues
[filterY
]);
149 // Load four pixels (16 bytes) together.
150 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
152 reinterpret_cast<const __m128i
*>(&sourceDataRows
[filterY
][outX
<< 2]);
153 __m128i src8
= _mm_loadu_si128(src
);
155 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
156 // multiply with current coefficient => accumulate the result.
157 // [16] a1 b1 g1 r1 a0 b0 g0 r0
158 __m128i src16
= _mm_unpacklo_epi8(src8
, zero
);
159 __m128i mul_hi
= _mm_mulhi_epi16(src16
, coeff16
);
160 __m128i mul_lo
= _mm_mullo_epi16(src16
, coeff16
);
162 __m128i t
= _mm_unpacklo_epi16(mul_lo
, mul_hi
);
163 accum0
= _mm_add_epi32(accum0
, t
);
165 t
= _mm_unpackhi_epi16(mul_lo
, mul_hi
);
166 accum1
= _mm_add_epi32(accum1
, t
);
168 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
169 // multiply with current coefficient => accumulate the result.
170 // [16] a3 b3 g3 r3 a2 b2 g2 r2
171 src16
= _mm_unpackhi_epi8(src8
, zero
);
172 mul_hi
= _mm_mulhi_epi16(src16
, coeff16
);
173 mul_lo
= _mm_mullo_epi16(src16
, coeff16
);
175 t
= _mm_unpacklo_epi16(mul_lo
, mul_hi
);
176 accum2
= _mm_add_epi32(accum2
, t
);
178 t
= _mm_unpackhi_epi16(mul_lo
, mul_hi
);
179 accum3
= _mm_add_epi32(accum3
, t
);
182 // Shift right for fixed point implementation.
183 accum0
= _mm_srai_epi32(accum0
, SkConvolutionFilter1D::kShiftBits
);
184 accum1
= _mm_srai_epi32(accum1
, SkConvolutionFilter1D::kShiftBits
);
185 accum2
= _mm_srai_epi32(accum2
, SkConvolutionFilter1D::kShiftBits
);
186 accum3
= _mm_srai_epi32(accum3
, SkConvolutionFilter1D::kShiftBits
);
188 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
189 // [16] a1 b1 g1 r1 a0 b0 g0 r0
190 accum0
= _mm_packs_epi32(accum0
, accum1
);
191 // [16] a3 b3 g3 r3 a2 b2 g2 r2
192 accum2
= _mm_packs_epi32(accum2
, accum3
);
194 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
195 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
196 accum0
= _mm_packus_epi16(accum0
, accum2
);
199 // Compute the max(ri, gi, bi) for each pixel.
200 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
201 __m128i a
= _mm_srli_epi32(accum0
, 8);
202 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
203 __m128i b
= _mm_max_epu8(a
, accum0
); // Max of r and g.
204 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
205 a
= _mm_srli_epi32(accum0
, 16);
206 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
207 b
= _mm_max_epu8(a
, b
); // Max of r and g and b.
208 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
209 b
= _mm_slli_epi32(b
, 24);
211 // Make sure the value of alpha channel is always larger than maximum
212 // value of color channels.
213 accum0
= _mm_max_epu8(b
, accum0
);
215 // Set value of alpha channels to 0xFF.
216 __m128i mask
= _mm_set1_epi32(0xff000000);
217 accum0
= _mm_or_si128(accum0
, mask
);
220 // Store the convolution result (16 bytes) and advance the pixel pointers.
221 _mm_storeu_si128(reinterpret_cast<__m128i
*>(outRow
), accum0
);
225 // When the width of the output is not divisible by 4, We need to save one
226 // pixel (4 bytes) each time. And also the fourth pixel is always absent.
227 int r
= pixelWidth
& 3;
229 __m128i accum0
= _mm_setzero_si128();
230 __m128i accum1
= _mm_setzero_si128();
231 __m128i accum2
= _mm_setzero_si128();
232 for (int filterY
= 0; filterY
< filterLength
; ++filterY
) {
233 __m128i coeff16
= _mm_set1_epi16(filterValues
[filterY
]);
234 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
235 const __m128i
* src
= reinterpret_cast<const __m128i
*>(
236 &sourceDataRows
[filterY
][width
<< 2]);
237 __m128i src8
= _mm_loadu_si128(src
);
238 // [16] a1 b1 g1 r1 a0 b0 g0 r0
239 __m128i src16
= _mm_unpacklo_epi8(src8
, zero
);
240 __m128i mul_hi
= _mm_mulhi_epi16(src16
, coeff16
);
241 __m128i mul_lo
= _mm_mullo_epi16(src16
, coeff16
);
243 __m128i t
= _mm_unpacklo_epi16(mul_lo
, mul_hi
);
244 accum0
= _mm_add_epi32(accum0
, t
);
246 t
= _mm_unpackhi_epi16(mul_lo
, mul_hi
);
247 accum1
= _mm_add_epi32(accum1
, t
);
248 // [16] a3 b3 g3 r3 a2 b2 g2 r2
249 src16
= _mm_unpackhi_epi8(src8
, zero
);
250 mul_hi
= _mm_mulhi_epi16(src16
, coeff16
);
251 mul_lo
= _mm_mullo_epi16(src16
, coeff16
);
253 t
= _mm_unpacklo_epi16(mul_lo
, mul_hi
);
254 accum2
= _mm_add_epi32(accum2
, t
);
257 accum0
= _mm_srai_epi32(accum0
, SkConvolutionFilter1D::kShiftBits
);
258 accum1
= _mm_srai_epi32(accum1
, SkConvolutionFilter1D::kShiftBits
);
259 accum2
= _mm_srai_epi32(accum2
, SkConvolutionFilter1D::kShiftBits
);
260 // [16] a1 b1 g1 r1 a0 b0 g0 r0
261 accum0
= _mm_packs_epi32(accum0
, accum1
);
262 // [16] a3 b3 g3 r3 a2 b2 g2 r2
263 accum2
= _mm_packs_epi32(accum2
, zero
);
264 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
265 accum0
= _mm_packus_epi16(accum0
, accum2
);
267 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
268 __m128i a
= _mm_srli_epi32(accum0
, 8);
269 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
270 __m128i b
= _mm_max_epu8(a
, accum0
); // Max of r and g.
271 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
272 a
= _mm_srli_epi32(accum0
, 16);
273 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
274 b
= _mm_max_epu8(a
, b
); // Max of r and g and b.
275 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
276 b
= _mm_slli_epi32(b
, 24);
277 accum0
= _mm_max_epu8(b
, accum0
);
279 __m128i mask
= _mm_set1_epi32(0xff000000);
280 accum0
= _mm_or_si128(accum0
, mask
);
283 for (int i
= 0; i
< r
; i
++) {
284 *(reinterpret_cast<int*>(outRow
)) = _mm_cvtsi128_si32(accum0
);
285 accum0
= _mm_srli_si128(accum0
, 4);
291 void convolve_vertically_sse2(
292 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
,
293 int filterLength
, unsigned char* const* sourceDataRows
, int pixelWidth
,
294 unsigned char* outRow
, bool hasAlpha
) {
296 ConvolveVertically
<true>(filterValues
, filterLength
, sourceDataRows
,
299 ConvolveVertically
<false>(filterValues
, filterLength
, sourceDataRows
,