Bug 1876335 - use GRADLE_MAVEN_REPOSITORIES in more places. r=owlish,geckoview-review...
[gecko.git] / gfx / 2d / ConvolutionFilterSSE2.cpp
blobc0aadb224549affbeb8f3ec614e87da30eb74eef
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 // Copyright (c) 2011-2016 Google Inc.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the gfx/skia/LICENSE file.
#include "SkConvolver.h"
#include "mozilla/Attributes.h"
#include <cstring>
#include <immintrin.h>
11 namespace skia {
13 static MOZ_ALWAYS_INLINE void AccumRemainder(
14 const unsigned char* pixelsLeft,
15 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i& accum,
16 int r) {
17 int remainder[4] = {0};
18 for (int i = 0; i < r; i++) {
19 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
20 remainder[0] += coeff * pixelsLeft[i * 4 + 0];
21 remainder[1] += coeff * pixelsLeft[i * 4 + 1];
22 remainder[2] += coeff * pixelsLeft[i * 4 + 2];
23 remainder[3] += coeff * pixelsLeft[i * 4 + 3];
25 __m128i t =
26 _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]);
27 accum = _mm_add_epi32(accum, t);
30 // Convolves horizontally along a single row. The row data is given in
31 // |srcData| and continues for the numValues() of the filter.
32 void convolve_horizontally_sse2(const unsigned char* srcData,
33 const SkConvolutionFilter1D& filter,
34 unsigned char* outRow, bool /*hasAlpha*/) {
35 // Output one pixel each iteration, calculating all channels (RGBA) together.
36 int numValues = filter.numValues();
37 for (int outX = 0; outX < numValues; outX++) {
38 // Get the filter that determines the current output pixel.
39 int filterOffset, filterLength;
40 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
41 filter.FilterForValue(outX, &filterOffset, &filterLength);
43 // Compute the first pixel in this row that the filter affects. It will
44 // touch |filterLength| pixels (4 bytes each) after this.
45 const unsigned char* rowToFilter = &srcData[filterOffset * 4];
47 __m128i zero = _mm_setzero_si128();
48 __m128i accum = _mm_setzero_si128();
50 // We will load and accumulate with four coefficients per iteration.
51 for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
52 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
53 __m128i coeff, coeff16;
54 // [16] xx xx xx xx c3 c2 c1 c0
55 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues));
56 // [16] xx xx xx xx c1 c1 c0 c0
57 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
58 // [16] c1 c1 c1 c1 c0 c0 c0 c0
59 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
61 // Load four pixels => unpack the first two pixels to 16 bits =>
62 // multiply with coefficients => accumulate the convolution result.
63 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
64 __m128i src8 =
65 _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowToFilter));
66 // [16] a1 b1 g1 r1 a0 b0 g0 r0
67 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
68 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
69 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
70 // [32] a0*c0 b0*c0 g0*c0 r0*c0
71 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
72 accum = _mm_add_epi32(accum, t);
73 // [32] a1*c1 b1*c1 g1*c1 r1*c1
74 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
75 accum = _mm_add_epi32(accum, t);
77 // Duplicate 3rd and 4th coefficients for all channels =>
78 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
79 // => accumulate the convolution results.
80 // [16] xx xx xx xx c3 c3 c2 c2
81 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
82 // [16] c3 c3 c3 c3 c2 c2 c2 c2
83 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
84 // [16] a3 g3 b3 r3 a2 g2 b2 r2
85 src16 = _mm_unpackhi_epi8(src8, zero);
86 mul_hi = _mm_mulhi_epi16(src16, coeff16);
87 mul_lo = _mm_mullo_epi16(src16, coeff16);
88 // [32] a2*c2 b2*c2 g2*c2 r2*c2
89 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
90 accum = _mm_add_epi32(accum, t);
91 // [32] a3*c3 b3*c3 g3*c3 r3*c3
92 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
93 accum = _mm_add_epi32(accum, t);
95 // Advance the pixel and coefficients pointers.
96 rowToFilter += 16;
97 filterValues += 4;
100 // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3
101 // coefficients one at a time.
102 int r = filterLength & 3;
103 if (r) {
104 int remainderOffset = (filterOffset + filterLength - r) * 4;
105 AccumRemainder(srcData + remainderOffset, filterValues, accum, r);
108 // Shift right for fixed point implementation.
109 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
111 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
112 accum = _mm_packs_epi32(accum, zero);
113 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
114 accum = _mm_packus_epi16(accum, zero);
116 // Store the pixel value of 32 bits.
117 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum);
118 outRow += 4;
122 // Does vertical convolution to produce one output row. The filter values and
123 // length are given in the first two parameters. These are applied to each
124 // of the rows pointed to in the |sourceDataRows| array, with each row
125 // being |pixelWidth| wide.
127 // The output must have room for |pixelWidth * 4| bytes.
128 template <bool hasAlpha>
129 static void ConvolveVertically(
130 const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
131 int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
132 unsigned char* outRow) {
133 // Output four pixels per iteration (16 bytes).
134 int width = pixelWidth & ~3;
135 __m128i zero = _mm_setzero_si128();
136 for (int outX = 0; outX < width; outX += 4) {
137 // Accumulated result for each pixel. 32 bits per RGBA channel.
138 __m128i accum0 = _mm_setzero_si128();
139 __m128i accum1 = _mm_setzero_si128();
140 __m128i accum2 = _mm_setzero_si128();
141 __m128i accum3 = _mm_setzero_si128();
143 // Convolve with one filter coefficient per iteration.
144 for (int filterY = 0; filterY < filterLength; filterY++) {
145 // Duplicate the filter coefficient 8 times.
146 // [16] cj cj cj cj cj cj cj cj
147 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
149 // Load four pixels (16 bytes) together.
150 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
151 const __m128i* src =
152 reinterpret_cast<const __m128i*>(&sourceDataRows[filterY][outX << 2]);
153 __m128i src8 = _mm_loadu_si128(src);
155 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
156 // multiply with current coefficient => accumulate the result.
157 // [16] a1 b1 g1 r1 a0 b0 g0 r0
158 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
159 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
160 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
161 // [32] a0 b0 g0 r0
162 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
163 accum0 = _mm_add_epi32(accum0, t);
164 // [32] a1 b1 g1 r1
165 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
166 accum1 = _mm_add_epi32(accum1, t);
168 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
169 // multiply with current coefficient => accumulate the result.
170 // [16] a3 b3 g3 r3 a2 b2 g2 r2
171 src16 = _mm_unpackhi_epi8(src8, zero);
172 mul_hi = _mm_mulhi_epi16(src16, coeff16);
173 mul_lo = _mm_mullo_epi16(src16, coeff16);
174 // [32] a2 b2 g2 r2
175 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
176 accum2 = _mm_add_epi32(accum2, t);
177 // [32] a3 b3 g3 r3
178 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
179 accum3 = _mm_add_epi32(accum3, t);
182 // Shift right for fixed point implementation.
183 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
184 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
185 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
186 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
188 // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
189 // [16] a1 b1 g1 r1 a0 b0 g0 r0
190 accum0 = _mm_packs_epi32(accum0, accum1);
191 // [16] a3 b3 g3 r3 a2 b2 g2 r2
192 accum2 = _mm_packs_epi32(accum2, accum3);
194 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
195 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
196 accum0 = _mm_packus_epi16(accum0, accum2);
198 if (hasAlpha) {
199 // Compute the max(ri, gi, bi) for each pixel.
200 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
201 __m128i a = _mm_srli_epi32(accum0, 8);
202 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
203 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
204 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
205 a = _mm_srli_epi32(accum0, 16);
206 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
207 b = _mm_max_epu8(a, b); // Max of r and g and b.
208 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
209 b = _mm_slli_epi32(b, 24);
211 // Make sure the value of alpha channel is always larger than maximum
212 // value of color channels.
213 accum0 = _mm_max_epu8(b, accum0);
214 } else {
215 // Set value of alpha channels to 0xFF.
216 __m128i mask = _mm_set1_epi32(0xff000000);
217 accum0 = _mm_or_si128(accum0, mask);
220 // Store the convolution result (16 bytes) and advance the pixel pointers.
221 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0);
222 outRow += 16;
225 // When the width of the output is not divisible by 4, We need to save one
226 // pixel (4 bytes) each time. And also the fourth pixel is always absent.
227 int r = pixelWidth & 3;
228 if (r) {
229 __m128i accum0 = _mm_setzero_si128();
230 __m128i accum1 = _mm_setzero_si128();
231 __m128i accum2 = _mm_setzero_si128();
232 for (int filterY = 0; filterY < filterLength; ++filterY) {
233 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
234 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
235 const __m128i* src = reinterpret_cast<const __m128i*>(
236 &sourceDataRows[filterY][width << 2]);
237 __m128i src8 = _mm_loadu_si128(src);
238 // [16] a1 b1 g1 r1 a0 b0 g0 r0
239 __m128i src16 = _mm_unpacklo_epi8(src8, zero);
240 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
241 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
242 // [32] a0 b0 g0 r0
243 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
244 accum0 = _mm_add_epi32(accum0, t);
245 // [32] a1 b1 g1 r1
246 t = _mm_unpackhi_epi16(mul_lo, mul_hi);
247 accum1 = _mm_add_epi32(accum1, t);
248 // [16] a3 b3 g3 r3 a2 b2 g2 r2
249 src16 = _mm_unpackhi_epi8(src8, zero);
250 mul_hi = _mm_mulhi_epi16(src16, coeff16);
251 mul_lo = _mm_mullo_epi16(src16, coeff16);
252 // [32] a2 b2 g2 r2
253 t = _mm_unpacklo_epi16(mul_lo, mul_hi);
254 accum2 = _mm_add_epi32(accum2, t);
257 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
258 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
259 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
260 // [16] a1 b1 g1 r1 a0 b0 g0 r0
261 accum0 = _mm_packs_epi32(accum0, accum1);
262 // [16] a3 b3 g3 r3 a2 b2 g2 r2
263 accum2 = _mm_packs_epi32(accum2, zero);
264 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
265 accum0 = _mm_packus_epi16(accum0, accum2);
266 if (hasAlpha) {
267 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
268 __m128i a = _mm_srli_epi32(accum0, 8);
269 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
270 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g.
271 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
272 a = _mm_srli_epi32(accum0, 16);
273 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
274 b = _mm_max_epu8(a, b); // Max of r and g and b.
275 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
276 b = _mm_slli_epi32(b, 24);
277 accum0 = _mm_max_epu8(b, accum0);
278 } else {
279 __m128i mask = _mm_set1_epi32(0xff000000);
280 accum0 = _mm_or_si128(accum0, mask);
283 for (int i = 0; i < r; i++) {
284 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0);
285 accum0 = _mm_srli_si128(accum0, 4);
286 outRow += 4;
291 void convolve_vertically_sse2(
292 const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
293 int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
294 unsigned char* outRow, bool hasAlpha) {
295 if (hasAlpha) {
296 ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
297 pixelWidth, outRow);
298 } else {
299 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
300 pixelWidth, outRow);
304 } // namespace skia