// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include <algorithm>

#include "skia/ext/convolver.h"
#include "third_party/skia/include/core/SkTypes.h"

#if defined(SIMD_SSE2)
#include <emmintrin.h>  // ARCH_CPU_X86_FAMILY was defined in build/config.h
#endif

namespace skia {

namespace {

// Converts the argument to an 8-bit unsigned value by clamping to the range
// 0-255.
inline unsigned char ClampTo8(int a) {
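  // The unsigned cast below maps any negative |a| to a large unsigned value,
  // so a single comparison rejects both a < 0 and a > 255 at once.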
  if (static_cast<unsigned>(a) < 256)
    return a;  // Avoid the extra check in the common case.
  if (a < 0)
    return 0;
  return 255;
}

// Stores a list of rows in a circular buffer. The usage is you write into it
// by calling AdvanceRow. It will keep track of which row in the buffer it
// should use next, and the total number of rows added.
class CircularRowBuffer {
 public:
  // The number of pixels in each row is given in |dest_row_pixel_width|.
  // The maximum number of rows needed in the buffer is |max_y_filter_size|
  // (we only need to store enough rows for the biggest filter).
  //
  // We use the |first_input_row| to compute the coordinates of all of the
  // following rows returned by Advance().
  CircularRowBuffer(int dest_row_pixel_width, int max_y_filter_size,
                    int first_input_row)
      : row_byte_width_(dest_row_pixel_width * 4),
        num_rows_(max_y_filter_size),
        next_row_(0),
        next_row_coordinate_(first_input_row) {
    buffer_.resize(row_byte_width_ * max_y_filter_size);
    row_addresses_.resize(num_rows_);
  }

  // Moves to the next row in the buffer, returning a pointer to the beginning
  // of it.
  unsigned char* AdvanceRow() {
    unsigned char* row = &buffer_[next_row_ * row_byte_width_];
    next_row_coordinate_++;

    // Set the pointer to the next row to use, wrapping around if necessary.
    next_row_++;
    if (next_row_ == num_rows_)
      next_row_ = 0;
    return row;
  }

  // Returns a pointer to an "unrolled" array of rows. These rows will start
  // at the y coordinate placed into |*first_row_index| and will continue in
  // order for the maximum number of rows in this circular buffer.
  //
  // The |*first_row_index| may be negative. This means the circular buffer
  // starts before the top of the image (it hasn't been filled yet).
  unsigned char* const* GetRowAddresses(int* first_row_index) {
    // Example for a 4-element circular buffer holding coords 6-9.
    //   Row 0   Coord 8
    //   Row 1   Coord 9
    //   Row 2   Coord 6  <- next_row_ = 2, next_row_coordinate_ = 10.
    //   Row 3   Coord 7
    //
    // The "next" row is also the first (lowest) coordinate. This computation
    // may yield a negative value, but that's OK, the math will work out
    // since the user of this buffer will compute the offset relative
    // to the first_row_index and the negative rows will never be used.
    *first_row_index = next_row_coordinate_ - num_rows_;

    int cur_row = next_row_;
    for (int i = 0; i < num_rows_; i++) {
      row_addresses_[i] = &buffer_[cur_row * row_byte_width_];

      // Advance to the next row, wrapping if necessary.
      cur_row++;
      if (cur_row == num_rows_)
        cur_row = 0;
    }
    return &row_addresses_[0];
  }

 private:
  // The buffer storing the rows. They are packed, each one row_byte_width_.
  std::vector<unsigned char> buffer_;

  // Number of bytes per row in the |buffer_|.
  int row_byte_width_;

  // The number of rows available in the buffer.
  int num_rows_;

  // The next row index we should write into. This wraps around as the
  // circular buffer is used.
  int next_row_;

  // The y coordinate of the |next_row_|. This is incremented each time a
  // new row is appended and does not wrap.
  int next_row_coordinate_;

  // Buffer used by GetRowAddresses().
  std::vector<unsigned char*> row_addresses_;
};
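
// Illustrative walkthrough (not in the original source): with num_rows_ = 3
// and first_input_row = 2, three AdvanceRow() calls fill buffer rows 0-2 with
// input rows 2, 3, and 4; GetRowAddresses() then sets *first_row_index to
// 5 - 3 = 2 and returns the row pointers in coordinate order.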

// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
template<bool has_alpha>
void ConvolveHorizontally(const unsigned char* src_data,
                          const ConvolutionFilter1D& filter,
                          unsigned char* out_row) {
  // Loop over each pixel on this row in the output image.
  int num_values = filter.num_values();
  for (int out_x = 0; out_x < num_values; out_x++) {
    // Get the filter that determines the current output pixel.
    int filter_offset, filter_length;
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

    // Compute the first pixel in this row that the filter affects. It will
    // touch |filter_length| pixels (4 bytes each) after this.
    const unsigned char* row_to_filter = &src_data[filter_offset * 4];

    // Apply the filter to the row to get the destination pixel in |accum|.
    int accum[4] = {0};
    for (int filter_x = 0; filter_x < filter_length; filter_x++) {
      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_x];
      accum[0] += cur_filter * row_to_filter[filter_x * 4 + 0];
      accum[1] += cur_filter * row_to_filter[filter_x * 4 + 1];
      accum[2] += cur_filter * row_to_filter[filter_x * 4 + 2];
      if (has_alpha)
        accum[3] += cur_filter * row_to_filter[filter_x * 4 + 3];
    }

    // Bring this value back in range. All of the filter scaling factors
    // are in fixed point with kShiftBits bits of fractional part.
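    // Worked example (assuming kShiftBits == 14, its value in convolver.h at
    // this revision): a weight of 0.25 is stored as 0.25 * 2^14 = 4096, so a
    // pixel value of 200 contributes 200 * 4096 = 819200, and the shift below
    // rescales the sum back to pixel units (819200 >> 14 == 50).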
    accum[0] >>= ConvolutionFilter1D::kShiftBits;
    accum[1] >>= ConvolutionFilter1D::kShiftBits;
    accum[2] >>= ConvolutionFilter1D::kShiftBits;
    if (has_alpha)
      accum[3] >>= ConvolutionFilter1D::kShiftBits;

    // Store the new pixel.
    out_row[out_x * 4 + 0] = ClampTo8(accum[0]);
    out_row[out_x * 4 + 1] = ClampTo8(accum[1]);
    out_row[out_x * 4 + 2] = ClampTo8(accum[2]);
    if (has_alpha)
      out_row[out_x * 4 + 3] = ClampTo8(accum[3]);
  }
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |pixel_width| wide.
//
// The output must have room for |pixel_width * 4| bytes.
template<bool has_alpha>
void ConvolveVertically(const ConvolutionFilter1D::Fixed* filter_values,
                        int filter_length,
                        unsigned char* const* source_data_rows,
                        int pixel_width,
                        unsigned char* out_row) {
  // We go through each column in the output and do a vertical convolution,
  // generating one output pixel each time.
  for (int out_x = 0; out_x < pixel_width; out_x++) {
    // Compute the number of bytes over in each row that the current column
    // we're convolving starts at. The pixel will cover the next 4 bytes.
    int byte_offset = out_x * 4;

    // Apply the filter to one column of pixels.
    int accum[4] = {0};
    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
      ConvolutionFilter1D::Fixed cur_filter = filter_values[filter_y];
      accum[0] += cur_filter * source_data_rows[filter_y][byte_offset + 0];
      accum[1] += cur_filter * source_data_rows[filter_y][byte_offset + 1];
      accum[2] += cur_filter * source_data_rows[filter_y][byte_offset + 2];
      if (has_alpha)
        accum[3] += cur_filter * source_data_rows[filter_y][byte_offset + 3];
    }

    // Bring this value back in range. All of the filter scaling factors
    // are in fixed point with kShiftBits bits of precision.
    accum[0] >>= ConvolutionFilter1D::kShiftBits;
    accum[1] >>= ConvolutionFilter1D::kShiftBits;
    accum[2] >>= ConvolutionFilter1D::kShiftBits;
    if (has_alpha)
      accum[3] >>= ConvolutionFilter1D::kShiftBits;

    // Store the new pixel.
    out_row[byte_offset + 0] = ClampTo8(accum[0]);
    out_row[byte_offset + 1] = ClampTo8(accum[1]);
    out_row[byte_offset + 2] = ClampTo8(accum[2]);
    if (has_alpha) {
      unsigned char alpha = ClampTo8(accum[3]);

      // Make sure the alpha channel doesn't come out smaller than any of the
      // color channels. We use premultiplied alpha channels, so this should
      // never happen, but rounding errors will cause this from time to time.
      // These "impossible" colors will cause overflows (and hence random pixel
      // values) when the resulting bitmap is drawn to the screen.
      //
      // We only need to do this when generating the final output row (here).
      int max_color_channel = std::max(out_row[byte_offset + 0],
          std::max(out_row[byte_offset + 1], out_row[byte_offset + 2]));
      if (alpha < max_color_channel)
        out_row[byte_offset + 3] = max_color_channel;
      else
        out_row[byte_offset + 3] = alpha;
    } else {
      // No alpha channel, the image is opaque.
      out_row[byte_offset + 3] = 0xff;
    }
  }
}

// Convolves horizontally along a single row. The row data is given in
// |src_data| and continues for the num_values() of the filter.
void ConvolveHorizontally_SSE2(const unsigned char* src_data,
                               const ConvolutionFilter1D& filter,
                               unsigned char* out_row) {
#if defined(SIMD_SSE2)
  int num_values = filter.num_values();

  int filter_offset, filter_length;
  __m128i zero = _mm_setzero_si128();
  __m128i mask[4];
  // |mask| will be used to decimate all extra filter coefficients that are
  // loaded by SIMD when |filter_length| is not divisible by 4.
  // mask[0] is not used in the following algorithm.
  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);
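  // Each -1 lane above is 0xFFFF, so mask[r] preserves exactly the low r
  // 16-bit coefficients and zeroes out the rest.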

  // Output one pixel each iteration, calculating all channels (RGBA) together.
  for (int out_x = 0; out_x < num_values; out_x++) {
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

    __m128i accum = _mm_setzero_si128();

    // Compute the first pixel in this row that the filter affects. It will
    // touch |filter_length| pixels (4 bytes each) after this.
    const __m128i* row_to_filter =
        reinterpret_cast<const __m128i*>(&src_data[filter_offset << 2]);

    // We will load and accumulate with four coefficients per iteration.
    for (int filter_x = 0; filter_x < filter_length >> 2; filter_x++) {
      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
      __m128i coeff, coeff16;
      // [16] xx xx xx xx c3 c2 c1 c0
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // [16] xx xx xx xx c1 c1 c0 c0
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      // [16] c1 c1 c1 c1 c0 c0 c0 c0
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

      // Load four pixels => unpack the first two pixels to 16 bits =>
      // multiply with coefficients => accumulate the convolution result.
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src8 = _mm_loadu_si128(row_to_filter);
      // [16] a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
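      // mul_lo/mul_hi hold the low and high 16 bits of each 16x16 product;
      // interleaving them with unpacklo/unpackhi below reassembles the full
      // 32-bit products for accumulation.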
      // [32] a0*c0 b0*c0 g0*c0 r0*c0
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      // [32] a1*c1 b1*c1 g1*c1 r1*c1
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      // Duplicate 3rd and 4th coefficients for all channels =>
      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
      // => accumulate the convolution results.
      // [16] xx xx xx xx c3 c3 c2 c2
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      // [16] c3 c3 c3 c3 c2 c2 c2 c2
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
      // [16] a3 g3 b3 r3 a2 g2 b2 r2
      src16 = _mm_unpackhi_epi8(src8, zero);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a2*c2 b2*c2 g2*c2 r2*c2
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      // [32] a3*c3 b3*c3 g3*c3 r3*c3
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      // Advance the pixel and coefficients pointers.
      row_to_filter += 1;
      filter_values += 4;
    }

    // When |filter_length| is not divisible by 4, we need to decimate some of
    // the filter coefficients that were loaded incorrectly to zero. Other than
    // that the algorithm is the same as above, except that the 4th pixel will
    // always be absent.
    int r = filter_length & 3;
    if (r) {
      // Note: filter_values must be padded to align_up(filter_offset, 8).
      __m128i coeff, coeff16;
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // Mask out extra filter taps.
      coeff = _mm_and_si128(coeff, mask[r]);
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);

      // Note: the line buffer must be padded to align_up(filter_offset, 16).
      // We resolve this by using the C version for the last horizontal line.
      __m128i src8 = _mm_loadu_si128(row_to_filter);
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);

      src16 = _mm_unpackhi_epi8(src8, zero);
      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum = _mm_add_epi32(accum, t);
    }

    // Shift right for fixed point implementation.
    accum = _mm_srai_epi32(accum, ConvolutionFilter1D::kShiftBits);

    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    accum = _mm_packs_epi32(accum, zero);
    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    accum = _mm_packus_epi16(accum, zero);

    // Store the pixel value of 32 bits.
    *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum);
    out_row += 4;
  }
#endif
}

// Convolves horizontally along four rows. The row data is given in
// |src_data| and continues for the num_values() of the filter.
// The algorithm is almost the same as |ConvolveHorizontally_SSE2|. Please
// refer to that function for detailed comments.
void ConvolveHorizontally4_SSE2(const unsigned char* src_data[4],
                                const ConvolutionFilter1D& filter,
                                unsigned char* out_row[4]) {
#if defined(SIMD_SSE2)
  int num_values = filter.num_values();

  int filter_offset, filter_length;
  __m128i zero = _mm_setzero_si128();
  __m128i mask[4];
  // |mask| will be used to decimate all extra filter coefficients that are
  // loaded by SIMD when |filter_length| is not divisible by 4.
  // mask[0] is not used in the following algorithm.
  mask[1] = _mm_set_epi16(0, 0, 0, 0, 0, 0, 0, -1);
  mask[2] = _mm_set_epi16(0, 0, 0, 0, 0, 0, -1, -1);
  mask[3] = _mm_set_epi16(0, 0, 0, 0, 0, -1, -1, -1);

  // Output one pixel each iteration, calculating all channels (RGBA) together.
  for (int out_x = 0; out_x < num_values; out_x++) {
    const ConvolutionFilter1D::Fixed* filter_values =
        filter.FilterForValue(out_x, &filter_offset, &filter_length);

    // Four pixels in a column per iteration.
    __m128i accum0 = _mm_setzero_si128();
    __m128i accum1 = _mm_setzero_si128();
    __m128i accum2 = _mm_setzero_si128();
    __m128i accum3 = _mm_setzero_si128();
    int start = (filter_offset << 2);
    // We will load and accumulate with four coefficients per iteration.
    for (int filter_x = 0; filter_x < (filter_length >> 2); filter_x++) {
      __m128i coeff, coeff16lo, coeff16hi;
      // [16] xx xx xx xx c3 c2 c1 c0
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // [16] xx xx xx xx c1 c1 c0 c0
      coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      // [16] c1 c1 c1 c1 c0 c0 c0 c0
      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
      // [16] xx xx xx xx c3 c3 c2 c2
      coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      // [16] c3 c3 c3 c3 c2 c2 c2 c2
      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

      __m128i src8, src16, mul_hi, mul_lo, t;
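
      // ITERATION convolves four pixels (16 bytes) from |src| against the
      // broadcast coefficients coeff16lo/coeff16hi and adds the 32-bit
      // per-channel products into |accum|; it is expanded once per source
      // row below.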
#define ITERATION(src, accum)                                          \
    src8 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));     \
    src16 = _mm_unpacklo_epi8(src8, zero);                             \
    mul_hi = _mm_mulhi_epi16(src16, coeff16lo);                        \
    mul_lo = _mm_mullo_epi16(src16, coeff16lo);                        \
    t = _mm_unpacklo_epi16(mul_lo, mul_hi);                            \
    accum = _mm_add_epi32(accum, t);                                   \
    t = _mm_unpackhi_epi16(mul_lo, mul_hi);                            \
    accum = _mm_add_epi32(accum, t);                                   \
    src16 = _mm_unpackhi_epi8(src8, zero);                             \
    mul_hi = _mm_mulhi_epi16(src16, coeff16hi);                        \
    mul_lo = _mm_mullo_epi16(src16, coeff16hi);                        \
    t = _mm_unpacklo_epi16(mul_lo, mul_hi);                            \
    accum = _mm_add_epi32(accum, t);                                   \
    t = _mm_unpackhi_epi16(mul_lo, mul_hi);                            \
    accum = _mm_add_epi32(accum, t)

      ITERATION(src_data[0] + start, accum0);
      ITERATION(src_data[1] + start, accum1);
      ITERATION(src_data[2] + start, accum2);
      ITERATION(src_data[3] + start, accum3);

      start += 16;
      filter_values += 4;
    }

    int r = filter_length & 3;
    if (r) {
      // Note: filter_values must be padded to align_up(filter_offset, 8).
      __m128i coeff;
      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filter_values));
      // Mask out extra filter taps.
      coeff = _mm_and_si128(coeff, mask[r]);

      __m128i coeff16lo = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
      // [16] c1 c1 c1 c1 c0 c0 c0 c0
      coeff16lo = _mm_unpacklo_epi16(coeff16lo, coeff16lo);
      __m128i coeff16hi = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
      coeff16hi = _mm_unpacklo_epi16(coeff16hi, coeff16hi);

      __m128i src8, src16, mul_hi, mul_lo, t;

      ITERATION(src_data[0] + start, accum0);
      ITERATION(src_data[1] + start, accum1);
      ITERATION(src_data[2] + start, accum2);
      ITERATION(src_data[3] + start, accum3);
    }

    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    accum0 = _mm_packs_epi32(accum0, zero);
    accum0 = _mm_packus_epi16(accum0, zero);
    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    accum1 = _mm_packs_epi32(accum1, zero);
    accum1 = _mm_packus_epi16(accum1, zero);
    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    accum2 = _mm_packs_epi32(accum2, zero);
    accum2 = _mm_packus_epi16(accum2, zero);
    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);
    accum3 = _mm_packs_epi32(accum3, zero);
    accum3 = _mm_packus_epi16(accum3, zero);

    *(reinterpret_cast<int*>(out_row[0])) = _mm_cvtsi128_si32(accum0);
    *(reinterpret_cast<int*>(out_row[1])) = _mm_cvtsi128_si32(accum1);
    *(reinterpret_cast<int*>(out_row[2])) = _mm_cvtsi128_si32(accum2);
    *(reinterpret_cast<int*>(out_row[3])) = _mm_cvtsi128_si32(accum3);

    out_row[0] += 4;
    out_row[1] += 4;
    out_row[2] += 4;
    out_row[3] += 4;
  }
#endif
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |source_data_rows| array, with each row
// being |pixel_width| wide.
//
// The output must have room for |pixel_width * 4| bytes.
template<bool has_alpha>
void ConvolveVertically_SSE2(const ConvolutionFilter1D::Fixed* filter_values,
                             int filter_length,
                             unsigned char* const* source_data_rows,
                             int pixel_width,
                             unsigned char* out_row) {
#if defined(SIMD_SSE2)
  int width = pixel_width & ~3;

  __m128i zero = _mm_setzero_si128();
  __m128i accum0, accum1, accum2, accum3, coeff16;
  const __m128i* src;
  // Output four pixels per iteration (16 bytes).
  for (int out_x = 0; out_x < width; out_x += 4) {
    // Accumulated result for each pixel. 32 bits per RGBA channel.
    accum0 = _mm_setzero_si128();
    accum1 = _mm_setzero_si128();
    accum2 = _mm_setzero_si128();
    accum3 = _mm_setzero_si128();

    // Convolve with one filter coefficient per iteration.
    for (int filter_y = 0; filter_y < filter_length; filter_y++) {
      // Duplicate the filter coefficient 8 times.
      // [16] cj cj cj cj cj cj cj cj
      coeff16 = _mm_set1_epi16(filter_values[filter_y]);

      // Load four pixels (16 bytes) together.
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      src = reinterpret_cast<const __m128i*>(
          &source_data_rows[filter_y][out_x << 2]);
      __m128i src8 = _mm_loadu_si128(src);

      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channel =>
      // multiply with the current coefficient => accumulate the result.
      // [16] a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a0 b0 g0 r0
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum0 = _mm_add_epi32(accum0, t);
      // [32] a1 b1 g1 r1
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum1 = _mm_add_epi32(accum1, t);

      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channel =>
      // multiply with the current coefficient => accumulate the result.
      // [16] a3 b3 g3 r3 a2 b2 g2 r2
      src16 = _mm_unpackhi_epi8(src8, zero);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a2 b2 g2 r2
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum2 = _mm_add_epi32(accum2, t);
      // [32] a3 b3 g3 r3
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum3 = _mm_add_epi32(accum3, t);
    }

    // Shift right for fixed point implementation.
    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    accum3 = _mm_srai_epi32(accum3, ConvolutionFilter1D::kShiftBits);

    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    // [16] a1 b1 g1 r1 a0 b0 g0 r0
    accum0 = _mm_packs_epi32(accum0, accum1);
    // [16] a3 b3 g3 r3 a2 b2 g2 r2
    accum2 = _mm_packs_epi32(accum2, accum3);

    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    accum0 = _mm_packus_epi16(accum0, accum2);

    if (has_alpha) {
      // Compute the max(ri, gi, bi) for each pixel.
      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
      __m128i a = _mm_srli_epi32(accum0, 8);
      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
      a = _mm_srli_epi32(accum0, 16);
      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
      b = _mm_max_epu8(a, b);  // Max of r and g and b.
      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
      b = _mm_slli_epi32(b, 24);

      // Make sure the value of the alpha channel is always larger than the
      // maximum value of the color channels.
      accum0 = _mm_max_epu8(b, accum0);
    } else {
      // Set the value of the alpha channels to 0xFF.
      __m128i mask = _mm_set1_epi32(0xff000000);
      accum0 = _mm_or_si128(accum0, mask);
    }

    // Store the convolution result (16 bytes) and advance the pixel pointers.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(out_row), accum0);
    out_row += 16;
  }

  // When the width of the output is not divisible by 4, we need to save one
  // pixel (4 bytes) at a time. Also, the fourth pixel is always absent.
  if (pixel_width & 3) {
    accum0 = _mm_setzero_si128();
    accum1 = _mm_setzero_si128();
    accum2 = _mm_setzero_si128();
    for (int filter_y = 0; filter_y < filter_length; ++filter_y) {
      coeff16 = _mm_set1_epi16(filter_values[filter_y]);
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      src = reinterpret_cast<const __m128i*>(
          &source_data_rows[filter_y][width << 2]);
      __m128i src8 = _mm_loadu_si128(src);
      // [16] a1 b1 g1 r1 a0 b0 g0 r0
      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a0 b0 g0 r0
      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum0 = _mm_add_epi32(accum0, t);
      // [32] a1 b1 g1 r1
      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
      accum1 = _mm_add_epi32(accum1, t);
      // [16] a3 b3 g3 r3 a2 b2 g2 r2
      src16 = _mm_unpackhi_epi8(src8, zero);
      mul_hi = _mm_mulhi_epi16(src16, coeff16);
      mul_lo = _mm_mullo_epi16(src16, coeff16);
      // [32] a2 b2 g2 r2
      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
      accum2 = _mm_add_epi32(accum2, t);
    }

    accum0 = _mm_srai_epi32(accum0, ConvolutionFilter1D::kShiftBits);
    accum1 = _mm_srai_epi32(accum1, ConvolutionFilter1D::kShiftBits);
    accum2 = _mm_srai_epi32(accum2, ConvolutionFilter1D::kShiftBits);
    // [16] a1 b1 g1 r1 a0 b0 g0 r0
    accum0 = _mm_packs_epi32(accum0, accum1);
    // [16] a3 b3 g3 r3 a2 b2 g2 r2
    accum2 = _mm_packs_epi32(accum2, zero);
    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    accum0 = _mm_packus_epi16(accum0, accum2);
    if (has_alpha) {
      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
      __m128i a = _mm_srli_epi32(accum0, 8);
      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
      a = _mm_srli_epi32(accum0, 16);
      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
      b = _mm_max_epu8(a, b);  // Max of r and g and b.
      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
      b = _mm_slli_epi32(b, 24);
      accum0 = _mm_max_epu8(b, accum0);
    } else {
      __m128i mask = _mm_set1_epi32(0xff000000);
      accum0 = _mm_or_si128(accum0, mask);
    }

    for (int out_x = width; out_x < pixel_width; out_x++) {
      *(reinterpret_cast<int*>(out_row)) = _mm_cvtsi128_si32(accum0);
      accum0 = _mm_srli_si128(accum0, 4);
      out_row += 4;
    }
  }
#endif
}

}  // namespace

// ConvolutionFilter1D ---------------------------------------------------------

ConvolutionFilter1D::ConvolutionFilter1D()
    : max_filter_(0) {
}

ConvolutionFilter1D::~ConvolutionFilter1D() {
}

void ConvolutionFilter1D::AddFilter(int filter_offset,
                                    const float* filter_values,
                                    int filter_length) {
  SkASSERT(filter_length > 0);

  std::vector<Fixed> fixed_values;
  fixed_values.reserve(filter_length);

  for (int i = 0; i < filter_length; ++i)
    fixed_values.push_back(FloatToFixed(filter_values[i]));

  AddFilter(filter_offset, &fixed_values[0], filter_length);
}

void ConvolutionFilter1D::AddFilter(int filter_offset,
                                    const Fixed* filter_values,
                                    int filter_length) {
  // It is common for leading/trailing filter values to be zeros. In such
  // cases it is beneficial to only store the central factors.
  // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
  // a 1080p image this optimization gives a ~10% speed improvement.
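  // For example, AddFilter(10, {0, 0, 3, 5, 0}, 5) is stored as offset 12,
  // length 2, values {3, 5}; the zero taps contribute nothing to the sums.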
  int first_non_zero = 0;
  while (first_non_zero < filter_length && filter_values[first_non_zero] == 0)
    first_non_zero++;

  if (first_non_zero < filter_length) {
    // Here we have at least one non-zero factor.
    int last_non_zero = filter_length - 1;
    while (last_non_zero >= 0 && filter_values[last_non_zero] == 0)
      last_non_zero--;

    filter_offset += first_non_zero;
    filter_length = last_non_zero + 1 - first_non_zero;
    SkASSERT(filter_length > 0);

    for (int i = first_non_zero; i <= last_non_zero; i++)
      filter_values_.push_back(filter_values[i]);
  } else {
    // Here all the factors were zeroes.
    filter_length = 0;
  }

  FilterInstance instance;

  // We pushed |filter_length| elements onto |filter_values_|.
  instance.data_location = (static_cast<int>(filter_values_.size()) -
                            filter_length);
  instance.offset = filter_offset;
  instance.length = filter_length;
  filters_.push_back(instance);

  max_filter_ = std::max(max_filter_, filter_length);
}
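
// Illustrative usage sketch (not part of the original file): the caller
// builds one 1D filter per output column and row, then runs the 2D pass.
// |x_offset|, |x_weights|, |x_num_weights|, and the y equivalents are
// hypothetical values computed by the caller's resampling logic.
//
//   ConvolutionFilter1D filter_x, filter_y;
//   for (int x = 0; x < out_width; ++x)
//     filter_x.AddFilter(x_offset[x], x_weights[x], x_num_weights[x]);
//   for (int y = 0; y < out_height; ++y)
//     filter_y.AddFilter(y_offset[y], y_weights[y], y_num_weights[y]);
//   BGRAConvolve2D(src_pixels, src_row_bytes, true, filter_x, filter_y,
//                  out_row_bytes, out_pixels, true);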

void BGRAConvolve2D(const unsigned char* source_data,
                    int source_byte_row_stride,
                    bool source_has_alpha,
                    const ConvolutionFilter1D& filter_x,
                    const ConvolutionFilter1D& filter_y,
                    int output_byte_row_stride,
                    unsigned char* output,
                    bool use_sse2) {
#if !defined(SIMD_SSE2)
  // Even if we have runtime support for SSE2 instructions, since the binary
  // was not built with SSE2 support, we have to fall back to the C version.
  use_sse2 = false;
#endif

  int max_y_filter_size = filter_y.max_filter();

  // The next row in the input that we will generate a horizontally
  // convolved row for. If the filter doesn't start at the beginning of the
  // image (this is the case when we are only resizing a subset), then we
  // don't want to generate any output rows before that. Compute the starting
  // row for convolution as the first pixel for the first vertical filter.
  int filter_offset, filter_length;
  const ConvolutionFilter1D::Fixed* filter_values =
      filter_y.FilterForValue(0, &filter_offset, &filter_length);
  int next_x_row = filter_offset;

  // We loop over each row in the input doing a horizontal convolution. This
  // will result in a horizontally convolved image. We write the results into
  // a circular buffer of convolved rows and do vertical convolution as rows
  // are available. This prevents us from having to store the entire
  // intermediate image and helps cache coherency.
  // We need four extra rows so that the horizontal convolution can be done on
  // four rows simultaneously. We also pad each row in the row buffer to be
  // aligned up to 16 bytes.
  // TODO(jiesun): We do not use aligned load from row buffer in vertical
  // convolution pass yet. Somehow Windows does not like it.
  int row_buffer_width = (filter_x.num_values() + 15) & ~0xF;
  int row_buffer_height = max_y_filter_size + (use_sse2 ? 4 : 0);
  CircularRowBuffer row_buffer(row_buffer_width,
                               row_buffer_height,
                               filter_offset);

  // Loop over every possible output row, processing just enough horizontal
  // convolutions to run each subsequent vertical convolution.
  SkASSERT(output_byte_row_stride >= filter_x.num_values() * 4);
  int num_output_rows = filter_y.num_values();

  // We need to check which is the last line to convolve before we advance 4
  // lines in one iteration.
  int last_filter_offset, last_filter_length;
  filter_y.FilterForValue(num_output_rows - 1, &last_filter_offset,
                          &last_filter_length);

  for (int out_y = 0; out_y < num_output_rows; out_y++) {
    filter_values = filter_y.FilterForValue(out_y,
                                            &filter_offset, &filter_length);

    // Generate output rows until we have enough to run the current filter.
    if (use_sse2) {
      while (next_x_row < filter_offset + filter_length) {
        if (next_x_row + 3 < last_filter_offset + last_filter_length - 1) {
          const unsigned char* src[4];
          unsigned char* out_row[4];
          for (int i = 0; i < 4; ++i) {
            src[i] = &source_data[(next_x_row + i) * source_byte_row_stride];
            out_row[i] = row_buffer.AdvanceRow();
          }
          ConvolveHorizontally4_SSE2(src, filter_x, out_row);
          next_x_row += 4;
        } else {
          // For the last row, the SSE2 load may access data beyond the
          // image area; therefore we use the C version here.
          if (next_x_row == last_filter_offset + last_filter_length - 1) {
            if (source_has_alpha) {
              ConvolveHorizontally<true>(
                  &source_data[next_x_row * source_byte_row_stride],
                  filter_x, row_buffer.AdvanceRow());
            } else {
              ConvolveHorizontally<false>(
                  &source_data[next_x_row * source_byte_row_stride],
                  filter_x, row_buffer.AdvanceRow());
            }
          } else {
            ConvolveHorizontally_SSE2(
                &source_data[next_x_row * source_byte_row_stride],
                filter_x, row_buffer.AdvanceRow());
          }
          next_x_row++;
        }
      }
    } else {
      while (next_x_row < filter_offset + filter_length) {
        if (source_has_alpha) {
          ConvolveHorizontally<true>(
              &source_data[next_x_row * source_byte_row_stride],
              filter_x, row_buffer.AdvanceRow());
        } else {
          ConvolveHorizontally<false>(
              &source_data[next_x_row * source_byte_row_stride],
              filter_x, row_buffer.AdvanceRow());
        }
        next_x_row++;
      }
    }

    // Compute where in the output image this row of final data will go.
    unsigned char* cur_output_row = &output[out_y * output_byte_row_stride];

    // Get the list of rows that the circular buffer has, in order.
    int first_row_in_circular_buffer;
    unsigned char* const* rows_to_convolve =
        row_buffer.GetRowAddresses(&first_row_in_circular_buffer);

    // Now compute the start of the subset of those rows that the filter
    // needs.
    unsigned char* const* first_row_for_filter =
        &rows_to_convolve[filter_offset - first_row_in_circular_buffer];

    if (source_has_alpha) {
      if (use_sse2) {
        ConvolveVertically_SSE2<true>(filter_values, filter_length,
                                      first_row_for_filter,
                                      filter_x.num_values(), cur_output_row);
      } else {
        ConvolveVertically<true>(filter_values, filter_length,
                                 first_row_for_filter,
                                 filter_x.num_values(), cur_output_row);
      }
    } else {
      if (use_sse2) {
        ConvolveVertically_SSE2<false>(filter_values, filter_length,
                                       first_row_for_filter,
                                       filter_x.num_values(), cur_output_row);
      } else {
        ConvolveVertically<false>(filter_values, filter_length,
                                  first_row_for_filter,
                                  filter_x.num_values(), cur_output_row);
      }
    }
  }
}

}  // namespace skia