1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 // Copyright (c) 2011-2016 Google Inc.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the gfx/skia/LICENSE file.
7 #include "SkConvolver.h"
8 #include "mozilla/Attributes.h"
13 static MOZ_ALWAYS_INLINE
void AccumRemainder(
14 const unsigned char* pixelsLeft
,
15 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
,
16 int32x4_t
& accum
, int r
) {
17 int remainder
[4] = {0};
18 for (int i
= 0; i
< r
; i
++) {
19 SkConvolutionFilter1D::ConvolutionFixed coeff
= filterValues
[i
];
20 remainder
[0] += coeff
* pixelsLeft
[i
* 4 + 0];
21 remainder
[1] += coeff
* pixelsLeft
[i
* 4 + 1];
22 remainder
[2] += coeff
* pixelsLeft
[i
* 4 + 2];
23 remainder
[3] += coeff
* pixelsLeft
[i
* 4 + 3];
25 int32x4_t t
= {remainder
[0], remainder
[1], remainder
[2], remainder
[3]};
29 // Convolves horizontally along a single row. The row data is given in
30 // |srcData| and continues for the numValues() of the filter.
31 void convolve_horizontally_neon(const unsigned char* srcData
,
32 const SkConvolutionFilter1D
& filter
,
33 unsigned char* outRow
, bool /*hasAlpha*/) {
34 // Loop over each pixel on this row in the output image.
35 int numValues
= filter
.numValues();
36 for (int outX
= 0; outX
< numValues
; outX
++) {
37 uint8x8_t coeff_mask0
= vcreate_u8(0x0100010001000100);
38 uint8x8_t coeff_mask1
= vcreate_u8(0x0302030203020302);
39 uint8x8_t coeff_mask2
= vcreate_u8(0x0504050405040504);
40 uint8x8_t coeff_mask3
= vcreate_u8(0x0706070607060706);
41 // Get the filter that determines the current output pixel.
42 int filterOffset
, filterLength
;
43 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
=
44 filter
.FilterForValue(outX
, &filterOffset
, &filterLength
);
46 // Compute the first pixel in this row that the filter affects. It will
47 // touch |filterLength| pixels (4 bytes each) after this.
48 const unsigned char* rowToFilter
= &srcData
[filterOffset
* 4];
50 // Apply the filter to the row to get the destination pixel in |accum|.
51 int32x4_t accum
= vdupq_n_s32(0);
52 for (int filterX
= 0; filterX
< filterLength
>> 2; filterX
++) {
53 // Load 4 coefficients
54 int16x4_t coeffs
, coeff0
, coeff1
, coeff2
, coeff3
;
55 coeffs
= vld1_s16(filterValues
);
56 coeff0
= vreinterpret_s16_u8(
57 vtbl1_u8(vreinterpret_u8_s16(coeffs
), coeff_mask0
));
58 coeff1
= vreinterpret_s16_u8(
59 vtbl1_u8(vreinterpret_u8_s16(coeffs
), coeff_mask1
));
60 coeff2
= vreinterpret_s16_u8(
61 vtbl1_u8(vreinterpret_u8_s16(coeffs
), coeff_mask2
));
62 coeff3
= vreinterpret_s16_u8(
63 vtbl1_u8(vreinterpret_u8_s16(coeffs
), coeff_mask3
));
65 // Load pixels and calc
66 uint8x16_t pixels
= vld1q_u8(rowToFilter
);
67 int16x8_t p01_16
= vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels
)));
68 int16x8_t p23_16
= vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels
)));
70 int16x4_t p0_src
= vget_low_s16(p01_16
);
71 int16x4_t p1_src
= vget_high_s16(p01_16
);
72 int16x4_t p2_src
= vget_low_s16(p23_16
);
73 int16x4_t p3_src
= vget_high_s16(p23_16
);
75 int32x4_t p0
= vmull_s16(p0_src
, coeff0
);
76 int32x4_t p1
= vmull_s16(p1_src
, coeff1
);
77 int32x4_t p2
= vmull_s16(p2_src
, coeff2
);
78 int32x4_t p3
= vmull_s16(p3_src
, coeff3
);
85 // Advance the pointers
90 int r
= filterLength
& 3;
92 int remainder_offset
= (filterOffset
+ filterLength
- r
) * 4;
93 AccumRemainder(srcData
+ remainder_offset
, filterValues
, accum
, r
);
96 // Bring this value back in range. All of the filter scaling factors
97 // are in fixed point with kShiftBits bits of fractional part.
98 accum
= vshrq_n_s32(accum
, SkConvolutionFilter1D::kShiftBits
);
100 // Pack and store the new pixel.
101 int16x4_t accum16
= vqmovn_s32(accum
);
102 uint8x8_t accum8
= vqmovun_s16(vcombine_s16(accum16
, accum16
));
103 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow
),
104 vreinterpret_u32_u8(accum8
), 0);
109 // Does vertical convolution to produce one output row. The filter values and
110 // length are given in the first two parameters. These are applied to each
111 // of the rows pointed to in the |sourceDataRows| array, with each row
112 // being |pixelWidth| wide.
114 // The output must have room for |pixelWidth * 4| bytes.
115 template <bool hasAlpha
>
116 static void ConvolveVertically(
117 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
,
118 int filterLength
, unsigned char* const* sourceDataRows
, int pixelWidth
,
119 unsigned char* outRow
) {
120 int width
= pixelWidth
& ~3;
122 // Output four pixels per iteration (16 bytes).
123 for (int outX
= 0; outX
< width
; outX
+= 4) {
124 // Accumulated result for each pixel. 32 bits per RGBA channel.
125 int32x4_t accum0
= vdupq_n_s32(0);
126 int32x4_t accum1
= vdupq_n_s32(0);
127 int32x4_t accum2
= vdupq_n_s32(0);
128 int32x4_t accum3
= vdupq_n_s32(0);
130 // Convolve with one filter coefficient per iteration.
131 for (int filterY
= 0; filterY
< filterLength
; filterY
++) {
132 // Duplicate the filter coefficient 4 times.
134 int16x4_t coeff16
= vdup_n_s16(filterValues
[filterY
]);
136 // Load four pixels (16 bytes) together.
137 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
138 uint8x16_t src8
= vld1q_u8(&sourceDataRows
[filterY
][outX
<< 2]);
140 int16x8_t src16_01
= vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8
)));
141 int16x8_t src16_23
= vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8
)));
142 int16x4_t src16_0
= vget_low_s16(src16_01
);
143 int16x4_t src16_1
= vget_high_s16(src16_01
);
144 int16x4_t src16_2
= vget_low_s16(src16_23
);
145 int16x4_t src16_3
= vget_high_s16(src16_23
);
147 accum0
+= vmull_s16(src16_0
, coeff16
);
148 accum1
+= vmull_s16(src16_1
, coeff16
);
149 accum2
+= vmull_s16(src16_2
, coeff16
);
150 accum3
+= vmull_s16(src16_3
, coeff16
);
153 // Shift right for fixed point implementation.
154 accum0
= vshrq_n_s32(accum0
, SkConvolutionFilter1D::kShiftBits
);
155 accum1
= vshrq_n_s32(accum1
, SkConvolutionFilter1D::kShiftBits
);
156 accum2
= vshrq_n_s32(accum2
, SkConvolutionFilter1D::kShiftBits
);
157 accum3
= vshrq_n_s32(accum3
, SkConvolutionFilter1D::kShiftBits
);
159 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
160 // [16] a1 b1 g1 r1 a0 b0 g0 r0
161 int16x8_t accum16_0
= vcombine_s16(vqmovn_s32(accum0
), vqmovn_s32(accum1
));
162 // [16] a3 b3 g3 r3 a2 b2 g2 r2
163 int16x8_t accum16_1
= vcombine_s16(vqmovn_s32(accum2
), vqmovn_s32(accum3
));
165 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
166 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
168 vcombine_u8(vqmovun_s16(accum16_0
), vqmovun_s16(accum16_1
));
171 // Compute the max(ri, gi, bi) for each pixel.
172 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
174 vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8
), 8));
175 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
176 uint8x16_t b
= vmaxq_u8(a
, accum8
); // Max of r and g
177 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
178 a
= vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8
), 16));
179 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
180 b
= vmaxq_u8(a
, b
); // Max of r and g and b.
181 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
182 b
= vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b
), 24));
184 // Make sure the value of alpha channel is always larger than maximum
185 // value of color channels.
186 accum8
= vmaxq_u8(b
, accum8
);
188 // Set value of alpha channels to 0xFF.
189 accum8
= vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8
) |
190 vdupq_n_u32(0xFF000000));
193 // Store the convolution result (16 bytes) and advance the pixel pointers.
194 vst1q_u8(outRow
, accum8
);
198 // Process the leftovers when the width of the output is not divisible
199 // by 4, that is at most 3 pixels.
200 int r
= pixelWidth
& 3;
202 int32x4_t accum0
= vdupq_n_s32(0);
203 int32x4_t accum1
= vdupq_n_s32(0);
204 int32x4_t accum2
= vdupq_n_s32(0);
206 for (int filterY
= 0; filterY
< filterLength
; ++filterY
) {
207 int16x4_t coeff16
= vdup_n_s16(filterValues
[filterY
]);
209 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
210 uint8x16_t src8
= vld1q_u8(&sourceDataRows
[filterY
][width
<< 2]);
212 int16x8_t src16_01
= vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8
)));
213 int16x8_t src16_23
= vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8
)));
214 int16x4_t src16_0
= vget_low_s16(src16_01
);
215 int16x4_t src16_1
= vget_high_s16(src16_01
);
216 int16x4_t src16_2
= vget_low_s16(src16_23
);
218 accum0
+= vmull_s16(src16_0
, coeff16
);
219 accum1
+= vmull_s16(src16_1
, coeff16
);
220 accum2
+= vmull_s16(src16_2
, coeff16
);
223 accum0
= vshrq_n_s32(accum0
, SkConvolutionFilter1D::kShiftBits
);
224 accum1
= vshrq_n_s32(accum1
, SkConvolutionFilter1D::kShiftBits
);
225 accum2
= vshrq_n_s32(accum2
, SkConvolutionFilter1D::kShiftBits
);
227 int16x8_t accum16_0
= vcombine_s16(vqmovn_s32(accum0
), vqmovn_s32(accum1
));
228 int16x8_t accum16_1
= vcombine_s16(vqmovn_s32(accum2
), vqmovn_s32(accum2
));
231 vcombine_u8(vqmovun_s16(accum16_0
), vqmovun_s16(accum16_1
));
234 // Compute the max(ri, gi, bi) for each pixel.
235 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
237 vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8
), 8));
238 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
239 uint8x16_t b
= vmaxq_u8(a
, accum8
); // Max of r and g
240 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
241 a
= vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8
), 16));
242 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
243 b
= vmaxq_u8(a
, b
); // Max of r and g and b.
244 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
245 b
= vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b
), 24));
246 // Make sure the value of alpha channel is always larger than maximum
247 // value of color channels.
248 accum8
= vmaxq_u8(b
, accum8
);
250 // Set value of alpha channels to 0xFF.
251 accum8
= vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8
) |
252 vdupq_n_u32(0xFF000000));
257 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow
),
258 vreinterpretq_u32_u8(accum8
), 0);
261 vst1_u32(reinterpret_cast<uint32_t*>(outRow
),
262 vreinterpret_u32_u8(vget_low_u8(accum8
)));
265 vst1_u32(reinterpret_cast<uint32_t*>(outRow
),
266 vreinterpret_u32_u8(vget_low_u8(accum8
)));
267 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow
+ 8),
268 vreinterpretq_u32_u8(accum8
), 2);
274 void convolve_vertically_neon(
275 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
,
276 int filterLength
, unsigned char* const* sourceDataRows
, int pixelWidth
,
277 unsigned char* outRow
, bool hasAlpha
) {
279 ConvolveVertically
<true>(filterValues
, filterLength
, sourceDataRows
,
282 ConvolveVertically
<false>(filterValues
, filterLength
, sourceDataRows
,