Bug 1890689 accumulate input in LargerReceiverBlockSizeThanDesiredBuffering GTest...
[gecko.git] / gfx / 2d / SkConvolver.cpp
blobb89a486d4855014d019c3c89e9bdda1c94b7220d
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 // Copyright (c) 2011-2016 Google Inc.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the gfx/skia/LICENSE file.
7 #include "SkConvolver.h"
9 #ifdef USE_SSE2
10 # include "mozilla/SSE.h"
11 #endif
13 #ifdef USE_NEON
14 # include "mozilla/arm.h"
15 #endif
17 namespace skia {
19 // Converts the argument to an 8-bit unsigned value by clamping to the range
20 // 0-255.
21 static inline unsigned char ClampTo8(int a) {
22 if (static_cast<unsigned>(a) < 256) {
23 return a; // Avoid the extra check in the common case.
25 if (a < 0) {
26 return 0;
28 return 255;
31 // Convolves horizontally along a single row. The row data is given in
32 // |srcData| and continues for the numValues() of the filter.
33 template <bool hasAlpha>
34 void ConvolveHorizontally(const unsigned char* srcData,
35 const SkConvolutionFilter1D& filter,
36 unsigned char* outRow) {
37 // Loop over each pixel on this row in the output image.
38 int numValues = filter.numValues();
39 for (int outX = 0; outX < numValues; outX++) {
40 // Get the filter that determines the current output pixel.
41 int filterOffset, filterLength;
42 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
43 filter.FilterForValue(outX, &filterOffset, &filterLength);
45 // Compute the first pixel in this row that the filter affects. It will
46 // touch |filterLength| pixels (4 bytes each) after this.
47 const unsigned char* rowToFilter = &srcData[filterOffset * 4];
49 // Apply the filter to the row to get the destination pixel in |accum|.
50 int accum[4] = {0};
51 for (int filterX = 0; filterX < filterLength; filterX++) {
52 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterX];
53 accum[0] += curFilter * rowToFilter[filterX * 4 + 0];
54 accum[1] += curFilter * rowToFilter[filterX * 4 + 1];
55 accum[2] += curFilter * rowToFilter[filterX * 4 + 2];
56 if (hasAlpha) {
57 accum[3] += curFilter * rowToFilter[filterX * 4 + 3];
61 // Bring this value back in range. All of the filter scaling factors
62 // are in fixed point with kShiftBits bits of fractional part.
63 accum[0] >>= SkConvolutionFilter1D::kShiftBits;
64 accum[1] >>= SkConvolutionFilter1D::kShiftBits;
65 accum[2] >>= SkConvolutionFilter1D::kShiftBits;
67 if (hasAlpha) {
68 accum[3] >>= SkConvolutionFilter1D::kShiftBits;
71 // Store the new pixel.
72 outRow[outX * 4 + 0] = ClampTo8(accum[0]);
73 outRow[outX * 4 + 1] = ClampTo8(accum[1]);
74 outRow[outX * 4 + 2] = ClampTo8(accum[2]);
75 if (hasAlpha) {
76 outRow[outX * 4 + 3] = ClampTo8(accum[3]);
81 // Does vertical convolution to produce one output row. The filter values and
82 // length are given in the first two parameters. These are applied to each
83 // of the rows pointed to in the |sourceDataRows| array, with each row
84 // being |pixelWidth| wide.
86 // The output must have room for |pixelWidth * 4| bytes.
87 template <bool hasAlpha>
88 void ConvolveVertically(
89 const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
90 int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
91 unsigned char* outRow) {
92 // We go through each column in the output and do a vertical convolution,
93 // generating one output pixel each time.
94 for (int outX = 0; outX < pixelWidth; outX++) {
95 // Compute the number of bytes over in each row that the current column
96 // we're convolving starts at. The pixel will cover the next 4 bytes.
97 int byteOffset = outX * 4;
99 // Apply the filter to one column of pixels.
100 int accum[4] = {0};
101 for (int filterY = 0; filterY < filterLength; filterY++) {
102 SkConvolutionFilter1D::ConvolutionFixed curFilter = filterValues[filterY];
103 accum[0] += curFilter * sourceDataRows[filterY][byteOffset + 0];
104 accum[1] += curFilter * sourceDataRows[filterY][byteOffset + 1];
105 accum[2] += curFilter * sourceDataRows[filterY][byteOffset + 2];
106 if (hasAlpha) {
107 accum[3] += curFilter * sourceDataRows[filterY][byteOffset + 3];
111 // Bring this value back in range. All of the filter scaling factors
112 // are in fixed point with kShiftBits bits of precision.
113 accum[0] >>= SkConvolutionFilter1D::kShiftBits;
114 accum[1] >>= SkConvolutionFilter1D::kShiftBits;
115 accum[2] >>= SkConvolutionFilter1D::kShiftBits;
116 if (hasAlpha) {
117 accum[3] >>= SkConvolutionFilter1D::kShiftBits;
120 // Store the new pixel.
121 outRow[byteOffset + 0] = ClampTo8(accum[0]);
122 outRow[byteOffset + 1] = ClampTo8(accum[1]);
123 outRow[byteOffset + 2] = ClampTo8(accum[2]);
125 if (hasAlpha) {
126 unsigned char alpha = ClampTo8(accum[3]);
128 // Make sure the alpha channel doesn't come out smaller than any of the
129 // color channels. We use premultipled alpha channels, so this should
130 // never happen, but rounding errors will cause this from time to time.
131 // These "impossible" colors will cause overflows (and hence random pixel
132 // values) when the resulting bitmap is drawn to the screen.
134 // We only need to do this when generating the final output row (here).
135 int maxColorChannel =
136 std::max(outRow[byteOffset + 0],
137 std::max(outRow[byteOffset + 1], outRow[byteOffset + 2]));
138 if (alpha < maxColorChannel) {
139 outRow[byteOffset + 3] = maxColorChannel;
140 } else {
141 outRow[byteOffset + 3] = alpha;
143 } else {
144 // No alpha channel, the image is opaque.
145 outRow[byteOffset + 3] = 0xff;
150 #ifdef USE_SSE2
151 void convolve_vertically_avx2(const int16_t* filter, int filterLen,
152 uint8_t* const* srcRows, int width, uint8_t* out,
153 bool hasAlpha);
154 void convolve_horizontally_sse2(const unsigned char* srcData,
155 const SkConvolutionFilter1D& filter,
156 unsigned char* outRow, bool hasAlpha);
157 void convolve_vertically_sse2(const int16_t* filter, int filterLen,
158 uint8_t* const* srcRows, int width, uint8_t* out,
159 bool hasAlpha);
160 #elif defined(USE_NEON)
161 void convolve_horizontally_neon(const unsigned char* srcData,
162 const SkConvolutionFilter1D& filter,
163 unsigned char* outRow, bool hasAlpha);
164 void convolve_vertically_neon(const int16_t* filter, int filterLen,
165 uint8_t* const* srcRows, int width, uint8_t* out,
166 bool hasAlpha);
167 #endif
169 void convolve_horizontally(const unsigned char* srcData,
170 const SkConvolutionFilter1D& filter,
171 unsigned char* outRow, bool hasAlpha) {
172 #ifdef USE_SSE2
173 if (mozilla::supports_sse2()) {
174 convolve_horizontally_sse2(srcData, filter, outRow, hasAlpha);
175 return;
177 #elif defined(USE_NEON)
178 if (mozilla::supports_neon()) {
179 convolve_horizontally_neon(srcData, filter, outRow, hasAlpha);
180 return;
182 #endif
183 if (hasAlpha) {
184 ConvolveHorizontally<true>(srcData, filter, outRow);
185 } else {
186 ConvolveHorizontally<false>(srcData, filter, outRow);
190 void convolve_vertically(
191 const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
192 int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
193 unsigned char* outRow, bool hasAlpha) {
194 #ifdef USE_SSE2
195 if (mozilla::supports_avx2()) {
196 convolve_vertically_avx2(filterValues, filterLength, sourceDataRows,
197 pixelWidth, outRow, hasAlpha);
198 return;
200 if (mozilla::supports_sse2()) {
201 convolve_vertically_sse2(filterValues, filterLength, sourceDataRows,
202 pixelWidth, outRow, hasAlpha);
203 return;
205 #elif defined(USE_NEON)
206 if (mozilla::supports_neon()) {
207 convolve_vertically_neon(filterValues, filterLength, sourceDataRows,
208 pixelWidth, outRow, hasAlpha);
209 return;
211 #endif
212 if (hasAlpha) {
213 ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
214 pixelWidth, outRow);
215 } else {
216 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
217 pixelWidth, outRow);
221 // Stores a list of rows in a circular buffer. The usage is you write into it
222 // by calling AdvanceRow. It will keep track of which row in the buffer it
223 // should use next, and the total number of rows added.
224 class CircularRowBuffer {
225 public:
226 // The number of pixels in each row is given in |sourceRowPixelWidth|.
227 // The maximum number of rows needed in the buffer is |maxYFilterSize|
228 // (we only need to store enough rows for the biggest filter).
230 // We use the |firstInputRow| to compute the coordinates of all of the
231 // following rows returned by Advance().
232 CircularRowBuffer(int destRowPixelWidth, int maxYFilterSize,
233 int firstInputRow)
234 : fRowByteWidth(destRowPixelWidth * 4),
235 fNumRows(maxYFilterSize),
236 fNextRow(0),
237 fNextRowCoordinate(firstInputRow) {}
239 bool AllocBuffer() {
240 return fBuffer.resize(fRowByteWidth * fNumRows) &&
241 fRowAddresses.resize(fNumRows);
244 // Moves to the next row in the buffer, returning a pointer to the beginning
245 // of it.
246 unsigned char* advanceRow() {
247 unsigned char* row = &fBuffer[fNextRow * fRowByteWidth];
248 fNextRowCoordinate++;
250 // Set the pointer to the next row to use, wrapping around if necessary.
251 fNextRow++;
252 if (fNextRow == fNumRows) {
253 fNextRow = 0;
255 return row;
258 // Returns a pointer to an "unrolled" array of rows. These rows will start
259 // at the y coordinate placed into |*firstRowIndex| and will continue in
260 // order for the maximum number of rows in this circular buffer.
262 // The |firstRowIndex_| may be negative. This means the circular buffer
263 // starts before the top of the image (it hasn't been filled yet).
264 unsigned char* const* GetRowAddresses(int* firstRowIndex) {
265 // Example for a 4-element circular buffer holding coords 6-9.
266 // Row 0 Coord 8
267 // Row 1 Coord 9
268 // Row 2 Coord 6 <- fNextRow = 2, fNextRowCoordinate = 10.
269 // Row 3 Coord 7
271 // The "next" row is also the first (lowest) coordinate. This computation
272 // may yield a negative value, but that's OK, the math will work out
273 // since the user of this buffer will compute the offset relative
274 // to the firstRowIndex and the negative rows will never be used.
275 *firstRowIndex = fNextRowCoordinate - fNumRows;
277 int curRow = fNextRow;
278 for (int i = 0; i < fNumRows; i++) {
279 fRowAddresses[i] = &fBuffer[curRow * fRowByteWidth];
281 // Advance to the next row, wrapping if necessary.
282 curRow++;
283 if (curRow == fNumRows) {
284 curRow = 0;
287 return &fRowAddresses[0];
290 private:
291 // The buffer storing the rows. They are packed, each one fRowByteWidth.
292 mozilla::Vector<unsigned char> fBuffer;
294 // Number of bytes per row in the |buffer|.
295 int fRowByteWidth;
297 // The number of rows available in the buffer.
298 int fNumRows;
300 // The next row index we should write into. This wraps around as the
301 // circular buffer is used.
302 int fNextRow;
304 // The y coordinate of the |fNextRow|. This is incremented each time a
305 // new row is appended and does not wrap.
306 int fNextRowCoordinate;
308 // Buffer used by GetRowAddresses().
309 mozilla::Vector<unsigned char*> fRowAddresses;
312 SkConvolutionFilter1D::SkConvolutionFilter1D() : fMaxFilter(0) {}
314 SkConvolutionFilter1D::~SkConvolutionFilter1D() = default;
316 bool SkConvolutionFilter1D::AddFilter(int filterOffset,
317 const ConvolutionFixed* filterValues,
318 int filterLength) {
319 // It is common for leading/trailing filter values to be zeros. In such
320 // cases it is beneficial to only store the central factors.
321 // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
322 // a 1080p image this optimization gives a ~10% speed improvement.
323 int filterSize = filterLength;
324 int firstNonZero = 0;
325 while (firstNonZero < filterLength && filterValues[firstNonZero] == 0) {
326 firstNonZero++;
329 if (firstNonZero < filterLength) {
330 // Here we have at least one non-zero factor.
331 int lastNonZero = filterLength - 1;
332 while (lastNonZero >= 0 && filterValues[lastNonZero] == 0) {
333 lastNonZero--;
336 filterOffset += firstNonZero;
337 filterLength = lastNonZero + 1 - firstNonZero;
338 MOZ_ASSERT(filterLength > 0);
340 if (!fFilterValues.append(&filterValues[firstNonZero], filterLength)) {
341 return false;
343 } else {
344 // Here all the factors were zeroes.
345 filterLength = 0;
348 FilterInstance instance = {
349 // We pushed filterLength elements onto fFilterValues
350 int(fFilterValues.length()) - filterLength, filterOffset, filterLength,
351 filterSize};
352 if (!fFilters.append(instance)) {
353 if (filterLength > 0) {
354 fFilterValues.shrinkBy(filterLength);
356 return false;
359 fMaxFilter = std::max(fMaxFilter, filterLength);
360 return true;
363 bool SkConvolutionFilter1D::ComputeFilterValues(
364 const SkBitmapFilter& aBitmapFilter, int32_t aSrcSize, int32_t aDstSize) {
365 // When we're doing a magnification, the scale will be larger than one. This
366 // means the destination pixels are much smaller than the source pixels, and
367 // that the range covered by the filter won't necessarily cover any source
368 // pixel boundaries. Therefore, we use these clamped values (max of 1) for
369 // some computations.
370 float scale = float(aDstSize) / float(aSrcSize);
371 float clampedScale = std::min(1.0f, scale);
372 // This is how many source pixels from the center we need to count
373 // to support the filtering function.
374 float srcSupport = aBitmapFilter.width() / clampedScale;
375 float invScale = 1.0f / scale;
377 mozilla::Vector<float, 64> filterValues;
378 mozilla::Vector<ConvolutionFixed, 64> fixedFilterValues;
380 // Loop over all pixels in the output range. We will generate one set of
381 // filter values for each one. Those values will tell us how to blend the
382 // source pixels to compute the destination pixel.
384 // This value is computed based on how SkTDArray::resizeStorageToAtLeast works
385 // in order to ensure that it does not overflow or assert. That functions
386 // computes
387 // n+4 + (n+4)/4
388 // and we want to to fit in a 32 bit signed int. Equating that to 2^31-1 and
389 // solving n gives n = (2^31-6)*4/5 = 1717986913.6
390 const int32_t maxToPassToReserveAdditional = 1717986913;
392 int32_t filterValueCount = int32_t(ceilf(aDstSize * srcSupport * 2));
393 if (aDstSize > maxToPassToReserveAdditional || filterValueCount < 0 ||
394 filterValueCount > maxToPassToReserveAdditional ||
395 !reserveAdditional(aDstSize, filterValueCount)) {
396 return false;
398 size_t oldFiltersLength = fFilters.length();
399 size_t oldFilterValuesLength = fFilterValues.length();
400 int oldMaxFilter = fMaxFilter;
401 for (int32_t destI = 0; destI < aDstSize; destI++) {
402 // This is the pixel in the source directly under the pixel in the dest.
403 // Note that we base computations on the "center" of the pixels. To see
404 // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
405 // downscale should "cover" the pixels around the pixel with *its center*
406 // at coordinates (2.5, 2.5) in the source, not those around (0, 0).
407 // Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
408 float srcPixel = (static_cast<float>(destI) + 0.5f) * invScale;
410 // Compute the (inclusive) range of source pixels the filter covers.
411 float srcBegin = std::max(0.0f, floorf(srcPixel - srcSupport));
412 float srcEnd = std::min(aSrcSize - 1.0f, ceilf(srcPixel + srcSupport));
414 // Compute the unnormalized filter value at each location of the source
415 // it covers.
417 // Sum of the filter values for normalizing.
418 // Distance from the center of the filter, this is the filter coordinate
419 // in source space. We also need to consider the center of the pixel
420 // when comparing distance against 'srcPixel'. In the 5x downscale
421 // example used above the distance from the center of the filter to
422 // the pixel with coordinates (2, 2) should be 0, because its center
423 // is at (2.5, 2.5).
424 int32_t filterCount = int32_t(srcEnd - srcBegin) + 1;
425 if (filterCount <= 0 || !filterValues.resize(filterCount) ||
426 !fixedFilterValues.resize(filterCount)) {
427 return false;
430 float destFilterDist = (srcBegin + 0.5f - srcPixel) * clampedScale;
431 float filterSum = 0.0f;
432 for (int32_t index = 0; index < filterCount; index++) {
433 float filterValue = aBitmapFilter.evaluate(destFilterDist);
434 filterValues[index] = filterValue;
435 filterSum += filterValue;
436 destFilterDist += clampedScale;
439 // The filter must be normalized so that we don't affect the brightness of
440 // the image. Convert to normalized fixed point.
441 ConvolutionFixed fixedSum = 0;
442 float invFilterSum = 1.0f / filterSum;
443 for (int32_t fixedI = 0; fixedI < filterCount; fixedI++) {
444 ConvolutionFixed curFixed = ToFixed(filterValues[fixedI] * invFilterSum);
445 fixedSum += curFixed;
446 fixedFilterValues[fixedI] = curFixed;
449 // The conversion to fixed point will leave some rounding errors, which
450 // we add back in to avoid affecting the brightness of the image. We
451 // arbitrarily add this to the center of the filter array (this won't always
452 // be the center of the filter function since it could get clipped on the
453 // edges, but it doesn't matter enough to worry about that case).
454 ConvolutionFixed leftovers = ToFixed(1) - fixedSum;
455 fixedFilterValues[filterCount / 2] += leftovers;
457 if (!AddFilter(int32_t(srcBegin), fixedFilterValues.begin(), filterCount)) {
458 fFilters.shrinkTo(oldFiltersLength);
459 fFilterValues.shrinkTo(oldFilterValuesLength);
460 fMaxFilter = oldMaxFilter;
461 return false;
465 return maxFilter() > 0 && numValues() == aDstSize;
468 // Does a two-dimensional convolution on the given source image.
470 // It is assumed the source pixel offsets referenced in the input filters
471 // reference only valid pixels, so the source image size is not required. Each
472 // row of the source image starts |sourceByteRowStride| after the previous
473 // one (this allows you to have rows with some padding at the end).
475 // The result will be put into the given output buffer. The destination image
476 // size will be xfilter.numValues() * yfilter.numValues() pixels. It will be
477 // in rows of exactly xfilter.numValues() * 4 bytes.
479 // |sourceHasAlpha| is a hint that allows us to avoid doing computations on
480 // the alpha channel if the image is opaque. If you don't know, set this to
481 // true and it will work properly, but setting this to false will be a few
482 // percent faster if you know the image is opaque.
484 // The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
485 // (this is ARGB when loaded into 32-bit words on a little-endian machine).
487 * Returns false if it was unable to perform the convolution/rescale. in which
488 * case the output buffer is assumed to be undefined.
490 bool BGRAConvolve2D(const unsigned char* sourceData, int sourceByteRowStride,
491 bool sourceHasAlpha, const SkConvolutionFilter1D& filterX,
492 const SkConvolutionFilter1D& filterY,
493 int outputByteRowStride, unsigned char* output) {
494 int maxYFilterSize = filterY.maxFilter();
496 // The next row in the input that we will generate a horizontally
497 // convolved row for. If the filter doesn't start at the beginning of the
498 // image (this is the case when we are only resizing a subset), then we
499 // don't want to generate any output rows before that. Compute the starting
500 // row for convolution as the first pixel for the first vertical filter.
501 int filterOffset = 0, filterLength = 0;
502 const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
503 filterY.FilterForValue(0, &filterOffset, &filterLength);
504 int nextXRow = filterOffset;
506 // We loop over each row in the input doing a horizontal convolution. This
507 // will result in a horizontally convolved image. We write the results into
508 // a circular buffer of convolved rows and do vertical convolution as rows
509 // are available. This prevents us from having to store the entire
510 // intermediate image and helps cache coherency.
511 // We will need four extra rows to allow horizontal convolution could be done
512 // simultaneously. We also pad each row in row buffer to be aligned-up to
513 // 32 bytes.
514 // TODO(jiesun): We do not use aligned load from row buffer in vertical
515 // convolution pass yet. Somehow Windows does not like it.
516 int rowBufferWidth = (filterX.numValues() + 31) & ~0x1F;
517 int rowBufferHeight = maxYFilterSize;
519 // check for too-big allocation requests : crbug.com/528628
521 int64_t size = int64_t(rowBufferWidth) * int64_t(rowBufferHeight);
522 // need some limit, to avoid over-committing success from malloc, but then
523 // crashing when we try to actually use the memory.
524 // 100meg seems big enough to allow "normal" zoom factors and image sizes
525 // through while avoiding the crash seen by the bug (crbug.com/528628)
526 if (size > 100 * 1024 * 1024) {
527 // printf_stderr("BGRAConvolve2D: tmp allocation [%lld] too
528 // big\n", size);
529 return false;
533 CircularRowBuffer rowBuffer(rowBufferWidth, rowBufferHeight, filterOffset);
534 if (!rowBuffer.AllocBuffer()) {
535 return false;
538 // Loop over every possible output row, processing just enough horizontal
539 // convolutions to run each subsequent vertical convolution.
540 MOZ_ASSERT(outputByteRowStride >= filterX.numValues() * 4);
541 int numOutputRows = filterY.numValues();
543 // We need to check which is the last line to convolve before we advance 4
544 // lines in one iteration.
545 int lastFilterOffset, lastFilterLength;
546 filterY.FilterForValue(numOutputRows - 1, &lastFilterOffset,
547 &lastFilterLength);
549 for (int outY = 0; outY < numOutputRows; outY++) {
550 filterValues = filterY.FilterForValue(outY, &filterOffset, &filterLength);
552 // Generate output rows until we have enough to run the current filter.
553 while (nextXRow < filterOffset + filterLength) {
554 convolve_horizontally(
555 &sourceData[(uint64_t)nextXRow * sourceByteRowStride], filterX,
556 rowBuffer.advanceRow(), sourceHasAlpha);
557 nextXRow++;
560 // Compute where in the output image this row of final data will go.
561 unsigned char* curOutputRow = &output[(uint64_t)outY * outputByteRowStride];
563 // Get the list of rows that the circular buffer has, in order.
564 int firstRowInCircularBuffer;
565 unsigned char* const* rowsToConvolve =
566 rowBuffer.GetRowAddresses(&firstRowInCircularBuffer);
568 // Now compute the start of the subset of those rows that the filter needs.
569 unsigned char* const* firstRowForFilter =
570 &rowsToConvolve[filterOffset - firstRowInCircularBuffer];
572 convolve_vertically(filterValues, filterLength, firstRowForFilter,
573 filterX.numValues(), curOutputRow, sourceHasAlpha);
575 return true;
578 } // namespace skia