Bug 1890689 accumulate input in LargerReceiverBlockSizeThanDesiredBuffering GTest...
[gecko.git] / gfx / 2d / ConvolutionFilterNEON.cpp
blob9983a0681a971d1dc1f103d11c2f5bad9772c8f2
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 // Copyright (c) 2011-2016 Google Inc.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the gfx/skia/LICENSE file.
7 #include "SkConvolver.h"
8 #include "mozilla/Attributes.h"
9 #include <arm_neon.h>
11 namespace skia {
// Scalar tail of the horizontal convolution: folds the last |r| filter taps
// into |accum|.
//
// |pixelsLeft| points at |r| consecutive 4-byte pixels and |filterValues| at
// the |r| coefficients that go with them. Each of the four byte channels is
// multiplied by its pixel's coefficient and summed in 32 bits; the four sums
// are then added lane-wise to |accum|.
static MOZ_ALWAYS_INLINE void AccumRemainder(
    const unsigned char* pixelsLeft,
    const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
    int32x4_t& accum, int r) {
  // One running sum per channel, in the same order as the bytes of a pixel.
  int32_t sums[4] = {0, 0, 0, 0};

  for (int tap = 0; tap < r; ++tap) {
    const SkConvolutionFilter1D::ConvolutionFixed weight = filterValues[tap];
    const unsigned char* px = pixelsLeft + tap * 4;
    for (int ch = 0; ch < 4; ++ch) {
      sums[ch] += weight * px[ch];
    }
  }

  // Lane i of the loaded vector is sums[i].
  accum = vaddq_s32(accum, vld1q_s32(sums));
}
// Convolves horizontally along a single row. The row data is given in
// |srcData| and continues for the numValues() of the filter.
//
// For every output index in [0, filter.numValues()) this looks up that
// index's filter (source offset, length and coefficients), forms the weighted
// sum of the covered source pixels (4 bytes each) and writes one 4-byte pixel
// to |outRow|, so |outRow| must have room for numValues() * 4 bytes. The last
// (bool) parameter is ignored.
void convolve_horizontally_neon(const unsigned char* srcData,
                                const SkConvolutionFilter1D& filter,
                                unsigned char* outRow, bool /*hasAlpha*/) {
  // Byte-shuffle tables for vtbl1_u8. Mask N picks bytes (2N, 2N+1) four
  // times over, i.e. it replicates the N-th 16-bit coefficient of a
  // 4-coefficient vector into all four lanes. They are constants, so they are
  // built once here instead of once per output pixel.
  const uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100);
  const uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302);
  const uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504);
  const uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706);

  // Loop over each pixel on this row in the output image.
  const int numValues = filter.numValues();
  for (int outX = 0; outX < numValues; outX++) {
    // Get the filter that determines the current output pixel.
    int filterOffset, filterLength;
    const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
        filter.FilterForValue(outX, &filterOffset, &filterLength);

    // Compute the first pixel in this row that the filter affects. It will
    // touch |filterLength| pixels (4 bytes each) after this.
    const unsigned char* rowToFilter = &srcData[filterOffset * 4];

    // Apply the filter to the row to get the destination pixel in |accum|.
    // Each lane of |accum| holds the running sum for one byte channel.
    int32x4_t accum = vdupq_n_s32(0);

    // Four taps (16 source bytes) are consumed per iteration.
    const int fullGroups = filterLength >> 2;
    for (int filterX = 0; filterX < fullGroups; filterX++) {
      // Load 4 coefficients and broadcast each one across its own vector.
      const uint8x8_t coeffs = vreinterpret_u8_s16(vld1_s16(filterValues));
      const int16x4_t coeff0 =
          vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask0));
      const int16x4_t coeff1 =
          vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask1));
      const int16x4_t coeff2 =
          vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask2));
      const int16x4_t coeff3 =
          vreinterpret_s16_u8(vtbl1_u8(coeffs, coeff_mask3));

      // Load 4 pixels and zero-extend every byte channel to 16 bits.
      const uint8x16_t pixels = vld1q_u8(rowToFilter);
      const int16x8_t p01_16 =
          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels)));
      const int16x8_t p23_16 =
          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels)));

      // accum += pixel[k] * coeff[k], widening the products to 32 bits.
      accum = vmlal_s16(accum, vget_low_s16(p01_16), coeff0);
      accum = vmlal_s16(accum, vget_high_s16(p01_16), coeff1);
      accum = vmlal_s16(accum, vget_low_s16(p23_16), coeff2);
      accum = vmlal_s16(accum, vget_high_s16(p23_16), coeff3);

      // Advance the pointers
      rowToFilter += 16;
      filterValues += 4;
    }

    // Up to three taps are left when |filterLength| is not a multiple of 4;
    // handle them with scalar code so no pixel past the filter is touched.
    const int r = filterLength & 3;
    if (r) {
      const int remainder_offset = (filterOffset + filterLength - r) * 4;
      AccumRemainder(srcData + remainder_offset, filterValues, accum, r);
    }

    // Bring this value back in range. All of the filter scaling factors
    // are in fixed point with kShiftBits bits of fractional part.
    accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits);

    // Pack and store the new pixel: saturate 32 -> 16 bits (signed), then
    // 16 -> 8 bits (unsigned), and write only the low 4 bytes.
    const int16x4_t accum16 = vqmovn_s32(accum);
    const uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16));
    vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow),
                  vreinterpret_u32_u8(accum8), 0);
    outRow += 4;
  }
}
// Applies the alpha policy to four packed 4-byte pixels.
//
// With |hasAlpha|, the top byte of every pixel is raised to at least the
// maximum of that pixel's three lower bytes. Without it, the top byte of
// every pixel is forced to 0xFF.
template <bool hasAlpha>
static MOZ_ALWAYS_INLINE uint8x16_t ApplyAlphaPolicy(uint8x16_t accum8) {
  const uint32x4_t pixels32 = vreinterpretq_u32_u8(accum8);
  if (hasAlpha) {
    // Compute the max(ri, gi, bi) for each pixel.
    // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    uint8x16_t a = vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 8));
    // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    uint8x16_t b = vmaxq_u8(a, accum8);  // Max of r and g
    // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    a = vreinterpretq_u8_u32(vshrq_n_u32(pixels32, 16));
    // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    b = vmaxq_u8(a, b);  // Max of r and g and b.
    // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24));

    // Make sure the value of alpha channel is never smaller than the maximum
    // value of the color channels. The zero bytes of |b| leave the color
    // channels unchanged.
    return vmaxq_u8(b, accum8);
  }

  // Set value of alpha channels to 0xFF.
  return vreinterpretq_u8_u32(vorrq_u32(pixels32, vdupq_n_u32(0xFF000000)));
}

// Turns four 32-bit fixed-point pixel accumulators into 16 output bytes:
// drops the kShiftBits fractional bits, narrows 32 -> 16 bits with signed
// saturation and 16 -> 8 bits with unsigned saturation, then applies the
// alpha policy selected by |hasAlpha|.
template <bool hasAlpha>
static MOZ_ALWAYS_INLINE uint8x16_t PackConvolvedPixels(int32x4_t accum0,
                                                        int32x4_t accum1,
                                                        int32x4_t accum2,
                                                        int32x4_t accum3) {
  // Shift right for fixed point implementation.
  accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits);
  accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits);
  accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits);
  accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits);

  // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
  // [16] a1 b1 g1 r1 a0 b0 g0 r0
  const int16x8_t accum16_0 =
      vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1));
  // [16] a3 b3 g3 r3 a2 b2 g2 r2
  const int16x8_t accum16_1 =
      vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3));

  // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
  // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
  const uint8x16_t accum8 =
      vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1));

  return ApplyAlphaPolicy<hasAlpha>(accum8);
}

// Does vertical convolution to produce one output row. The filter values and
// length are given in the first two parameters. These are applied to each
// of the rows pointed to in the |sourceDataRows| array, with each row
// being |pixelWidth| wide.
//
// The output must have room for |pixelWidth * 4| bytes.
//
// |sourceDataRows| is indexed by filter tap, so it must hold |filterLength|
// row pointers. |hasAlpha| selects how the top byte of each output pixel is
// produced (see ApplyAlphaPolicy).
template <bool hasAlpha>
static void ConvolveVertically(
    const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
    int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
    unsigned char* outRow) {
  // Number of pixels that can be handled four at a time.
  const int width = pixelWidth & ~3;

  // Output four pixels per iteration (16 bytes).
  for (int outX = 0; outX < width; outX += 4) {
    // Accumulated result for each pixel. 32 bits per RGBA channel.
    int32x4_t accum0 = vdupq_n_s32(0);
    int32x4_t accum1 = vdupq_n_s32(0);
    int32x4_t accum2 = vdupq_n_s32(0);
    int32x4_t accum3 = vdupq_n_s32(0);

    // Convolve with one filter coefficient per iteration.
    for (int filterY = 0; filterY < filterLength; filterY++) {
      // Duplicate the filter coefficient 4 times.
      // [16] cj cj cj cj
      const int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

      // Load four pixels (16 bytes) together.
      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      const uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]);

      // Zero-extend every channel to 16 bits.
      const int16x8_t src16_01 =
          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
      const int16x8_t src16_23 =
          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

      // accumN += pixelN * coefficient, widening the products to 32 bits.
      accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
      accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
      accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
      accum3 = vmlal_s16(accum3, vget_high_s16(src16_23), coeff16);
    }

    // Store the convolution result (16 bytes) and advance the pixel pointers.
    vst1q_u8(outRow,
             PackConvolvedPixels<hasAlpha>(accum0, accum1, accum2, accum3));
    outRow += 16;
  }

  // Process the leftovers when the width of the output is not divisible
  // by 4, that is at most 3 pixels.
  const int r = pixelWidth & 3;
  if (r) {
    int32x4_t accum0 = vdupq_n_s32(0);
    int32x4_t accum1 = vdupq_n_s32(0);
    int32x4_t accum2 = vdupq_n_s32(0);

    for (int filterY = 0; filterY < filterLength; ++filterY) {
      const int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]);

      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
      // NOTE(review): this is a full 16-byte load although only r * 4 bytes
      // belong to the row's remaining pixels, so it reads 4 to 12 bytes past
      // the last pixel. Presumably callers pad each source row to allow
      // this - confirm against the code that allocates the rows.
      const uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]);

      const int16x8_t src16_01 =
          vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8)));
      const int16x8_t src16_23 =
          vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8)));

      // Only three pixels can be needed, so the fourth is never accumulated.
      accum0 = vmlal_s16(accum0, vget_low_s16(src16_01), coeff16);
      accum1 = vmlal_s16(accum1, vget_high_s16(src16_01), coeff16);
      accum2 = vmlal_s16(accum2, vget_low_s16(src16_23), coeff16);
    }

    // The third accumulator is passed twice purely to fill the vector; the
    // fourth output pixel is never stored.
    const uint32x4_t out32 = vreinterpretq_u32_u8(
        PackConvolvedPixels<hasAlpha>(accum0, accum1, accum2, accum2));

    // Write exactly |r| pixels so nothing past the end of |outRow| is touched.
    switch (r) {
      case 1:
        vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), out32, 0);
        break;
      case 2:
        vst1_u32(reinterpret_cast<uint32_t*>(outRow), vget_low_u32(out32));
        break;
      case 3:
        vst1_u32(reinterpret_cast<uint32_t*>(outRow), vget_low_u32(out32));
        vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow + 8), out32, 2);
        break;
    }
  }
}
// NEON entry point for vertical convolution. Forwards all arguments
// unchanged to the ConvolveVertically instantiation that matches the runtime
// |hasAlpha| flag.
void convolve_vertically_neon(
    const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
    int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
    unsigned char* outRow, bool hasAlpha) {
  if (!hasAlpha) {
    ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
                              pixelWidth, outRow);
    return;
  }

  ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
                           pixelWidth, outRow);
}
287 } // namespace skia