1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
3 // Copyright (c) 2011-2016 Google Inc.
4 // Use of this source code is governed by a BSD-style license that can be
5 // found in the gfx/skia/LICENSE file.
7 #include "SkConvolver.h"
10 # include "mozilla/SSE.h"
14 # include "mozilla/arm.h"
19 // Converts the argument to an 8-bit unsigned value by clamping to the range
21 static inline unsigned char ClampTo8(int a
) {
22 if (static_cast<unsigned>(a
) < 256) {
23 return a
; // Avoid the extra check in the common case.
31 // Convolves horizontally along a single row. The row data is given in
32 // |srcData| and continues for the numValues() of the filter.
33 template <bool hasAlpha
>
34 void ConvolveHorizontally(const unsigned char* srcData
,
35 const SkConvolutionFilter1D
& filter
,
36 unsigned char* outRow
) {
37 // Loop over each pixel on this row in the output image.
38 int numValues
= filter
.numValues();
39 for (int outX
= 0; outX
< numValues
; outX
++) {
40 // Get the filter that determines the current output pixel.
41 int filterOffset
, filterLength
;
42 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
=
43 filter
.FilterForValue(outX
, &filterOffset
, &filterLength
);
45 // Compute the first pixel in this row that the filter affects. It will
46 // touch |filterLength| pixels (4 bytes each) after this.
47 const unsigned char* rowToFilter
= &srcData
[filterOffset
* 4];
49 // Apply the filter to the row to get the destination pixel in |accum|.
51 for (int filterX
= 0; filterX
< filterLength
; filterX
++) {
52 SkConvolutionFilter1D::ConvolutionFixed curFilter
= filterValues
[filterX
];
53 accum
[0] += curFilter
* rowToFilter
[filterX
* 4 + 0];
54 accum
[1] += curFilter
* rowToFilter
[filterX
* 4 + 1];
55 accum
[2] += curFilter
* rowToFilter
[filterX
* 4 + 2];
57 accum
[3] += curFilter
* rowToFilter
[filterX
* 4 + 3];
61 // Bring this value back in range. All of the filter scaling factors
62 // are in fixed point with kShiftBits bits of fractional part.
63 accum
[0] >>= SkConvolutionFilter1D::kShiftBits
;
64 accum
[1] >>= SkConvolutionFilter1D::kShiftBits
;
65 accum
[2] >>= SkConvolutionFilter1D::kShiftBits
;
68 accum
[3] >>= SkConvolutionFilter1D::kShiftBits
;
71 // Store the new pixel.
72 outRow
[outX
* 4 + 0] = ClampTo8(accum
[0]);
73 outRow
[outX
* 4 + 1] = ClampTo8(accum
[1]);
74 outRow
[outX
* 4 + 2] = ClampTo8(accum
[2]);
76 outRow
[outX
* 4 + 3] = ClampTo8(accum
[3]);
81 // Does vertical convolution to produce one output row. The filter values and
82 // length are given in the first two parameters. These are applied to each
83 // of the rows pointed to in the |sourceDataRows| array, with each row
84 // being |pixelWidth| wide.
86 // The output must have room for |pixelWidth * 4| bytes.
87 template <bool hasAlpha
>
88 void ConvolveVertically(
89 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
,
90 int filterLength
, unsigned char* const* sourceDataRows
, int pixelWidth
,
91 unsigned char* outRow
) {
92 // We go through each column in the output and do a vertical convolution,
93 // generating one output pixel each time.
94 for (int outX
= 0; outX
< pixelWidth
; outX
++) {
95 // Compute the number of bytes over in each row that the current column
96 // we're convolving starts at. The pixel will cover the next 4 bytes.
97 int byteOffset
= outX
* 4;
99 // Apply the filter to one column of pixels.
101 for (int filterY
= 0; filterY
< filterLength
; filterY
++) {
102 SkConvolutionFilter1D::ConvolutionFixed curFilter
= filterValues
[filterY
];
103 accum
[0] += curFilter
* sourceDataRows
[filterY
][byteOffset
+ 0];
104 accum
[1] += curFilter
* sourceDataRows
[filterY
][byteOffset
+ 1];
105 accum
[2] += curFilter
* sourceDataRows
[filterY
][byteOffset
+ 2];
107 accum
[3] += curFilter
* sourceDataRows
[filterY
][byteOffset
+ 3];
111 // Bring this value back in range. All of the filter scaling factors
112 // are in fixed point with kShiftBits bits of precision.
113 accum
[0] >>= SkConvolutionFilter1D::kShiftBits
;
114 accum
[1] >>= SkConvolutionFilter1D::kShiftBits
;
115 accum
[2] >>= SkConvolutionFilter1D::kShiftBits
;
117 accum
[3] >>= SkConvolutionFilter1D::kShiftBits
;
120 // Store the new pixel.
121 outRow
[byteOffset
+ 0] = ClampTo8(accum
[0]);
122 outRow
[byteOffset
+ 1] = ClampTo8(accum
[1]);
123 outRow
[byteOffset
+ 2] = ClampTo8(accum
[2]);
126 unsigned char alpha
= ClampTo8(accum
[3]);
128 // Make sure the alpha channel doesn't come out smaller than any of the
129 // color channels. We use premultipled alpha channels, so this should
130 // never happen, but rounding errors will cause this from time to time.
131 // These "impossible" colors will cause overflows (and hence random pixel
132 // values) when the resulting bitmap is drawn to the screen.
134 // We only need to do this when generating the final output row (here).
135 int maxColorChannel
=
136 std::max(outRow
[byteOffset
+ 0],
137 std::max(outRow
[byteOffset
+ 1], outRow
[byteOffset
+ 2]));
138 if (alpha
< maxColorChannel
) {
139 outRow
[byteOffset
+ 3] = maxColorChannel
;
141 outRow
[byteOffset
+ 3] = alpha
;
144 // No alpha channel, the image is opaque.
145 outRow
[byteOffset
+ 3] = 0xff;
151 void convolve_vertically_avx2(const int16_t* filter
, int filterLen
,
152 uint8_t* const* srcRows
, int width
, uint8_t* out
,
154 void convolve_horizontally_sse2(const unsigned char* srcData
,
155 const SkConvolutionFilter1D
& filter
,
156 unsigned char* outRow
, bool hasAlpha
);
157 void convolve_vertically_sse2(const int16_t* filter
, int filterLen
,
158 uint8_t* const* srcRows
, int width
, uint8_t* out
,
160 #elif defined(USE_NEON)
161 void convolve_horizontally_neon(const unsigned char* srcData
,
162 const SkConvolutionFilter1D
& filter
,
163 unsigned char* outRow
, bool hasAlpha
);
164 void convolve_vertically_neon(const int16_t* filter
, int filterLen
,
165 uint8_t* const* srcRows
, int width
, uint8_t* out
,
169 void convolve_horizontally(const unsigned char* srcData
,
170 const SkConvolutionFilter1D
& filter
,
171 unsigned char* outRow
, bool hasAlpha
) {
173 if (mozilla::supports_sse2()) {
174 convolve_horizontally_sse2(srcData
, filter
, outRow
, hasAlpha
);
177 #elif defined(USE_NEON)
178 if (mozilla::supports_neon()) {
179 convolve_horizontally_neon(srcData
, filter
, outRow
, hasAlpha
);
184 ConvolveHorizontally
<true>(srcData
, filter
, outRow
);
186 ConvolveHorizontally
<false>(srcData
, filter
, outRow
);
190 void convolve_vertically(
191 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
,
192 int filterLength
, unsigned char* const* sourceDataRows
, int pixelWidth
,
193 unsigned char* outRow
, bool hasAlpha
) {
195 if (mozilla::supports_avx2()) {
196 convolve_vertically_avx2(filterValues
, filterLength
, sourceDataRows
,
197 pixelWidth
, outRow
, hasAlpha
);
200 if (mozilla::supports_sse2()) {
201 convolve_vertically_sse2(filterValues
, filterLength
, sourceDataRows
,
202 pixelWidth
, outRow
, hasAlpha
);
205 #elif defined(USE_NEON)
206 if (mozilla::supports_neon()) {
207 convolve_vertically_neon(filterValues
, filterLength
, sourceDataRows
,
208 pixelWidth
, outRow
, hasAlpha
);
213 ConvolveVertically
<true>(filterValues
, filterLength
, sourceDataRows
,
216 ConvolveVertically
<false>(filterValues
, filterLength
, sourceDataRows
,
221 // Stores a list of rows in a circular buffer. The usage is you write into it
222 // by calling AdvanceRow. It will keep track of which row in the buffer it
223 // should use next, and the total number of rows added.
224 class CircularRowBuffer
{
226 // The number of pixels in each row is given in |sourceRowPixelWidth|.
227 // The maximum number of rows needed in the buffer is |maxYFilterSize|
228 // (we only need to store enough rows for the biggest filter).
230 // We use the |firstInputRow| to compute the coordinates of all of the
231 // following rows returned by Advance().
232 CircularRowBuffer(int destRowPixelWidth
, int maxYFilterSize
,
234 : fRowByteWidth(destRowPixelWidth
* 4),
235 fNumRows(maxYFilterSize
),
237 fNextRowCoordinate(firstInputRow
) {}
240 return fBuffer
.resize(fRowByteWidth
* fNumRows
) &&
241 fRowAddresses
.resize(fNumRows
);
244 // Moves to the next row in the buffer, returning a pointer to the beginning
246 unsigned char* advanceRow() {
247 unsigned char* row
= &fBuffer
[fNextRow
* fRowByteWidth
];
248 fNextRowCoordinate
++;
250 // Set the pointer to the next row to use, wrapping around if necessary.
252 if (fNextRow
== fNumRows
) {
258 // Returns a pointer to an "unrolled" array of rows. These rows will start
259 // at the y coordinate placed into |*firstRowIndex| and will continue in
260 // order for the maximum number of rows in this circular buffer.
262 // The |firstRowIndex_| may be negative. This means the circular buffer
263 // starts before the top of the image (it hasn't been filled yet).
264 unsigned char* const* GetRowAddresses(int* firstRowIndex
) {
265 // Example for a 4-element circular buffer holding coords 6-9.
268 // Row 2 Coord 6 <- fNextRow = 2, fNextRowCoordinate = 10.
271 // The "next" row is also the first (lowest) coordinate. This computation
272 // may yield a negative value, but that's OK, the math will work out
273 // since the user of this buffer will compute the offset relative
274 // to the firstRowIndex and the negative rows will never be used.
275 *firstRowIndex
= fNextRowCoordinate
- fNumRows
;
277 int curRow
= fNextRow
;
278 for (int i
= 0; i
< fNumRows
; i
++) {
279 fRowAddresses
[i
] = &fBuffer
[curRow
* fRowByteWidth
];
281 // Advance to the next row, wrapping if necessary.
283 if (curRow
== fNumRows
) {
287 return &fRowAddresses
[0];
291 // The buffer storing the rows. They are packed, each one fRowByteWidth.
292 mozilla::Vector
<unsigned char> fBuffer
;
294 // Number of bytes per row in the |buffer|.
297 // The number of rows available in the buffer.
300 // The next row index we should write into. This wraps around as the
301 // circular buffer is used.
304 // The y coordinate of the |fNextRow|. This is incremented each time a
305 // new row is appended and does not wrap.
306 int fNextRowCoordinate
;
308 // Buffer used by GetRowAddresses().
309 mozilla::Vector
<unsigned char*> fRowAddresses
;
312 SkConvolutionFilter1D::SkConvolutionFilter1D() : fMaxFilter(0) {}
314 SkConvolutionFilter1D::~SkConvolutionFilter1D() = default;
316 bool SkConvolutionFilter1D::AddFilter(int filterOffset
,
317 const ConvolutionFixed
* filterValues
,
319 // It is common for leading/trailing filter values to be zeros. In such
320 // cases it is beneficial to only store the central factors.
321 // For a scaling to 1/4th in each dimension using a Lanczos-2 filter on
322 // a 1080p image this optimization gives a ~10% speed improvement.
323 int filterSize
= filterLength
;
324 int firstNonZero
= 0;
325 while (firstNonZero
< filterLength
&& filterValues
[firstNonZero
] == 0) {
329 if (firstNonZero
< filterLength
) {
330 // Here we have at least one non-zero factor.
331 int lastNonZero
= filterLength
- 1;
332 while (lastNonZero
>= 0 && filterValues
[lastNonZero
] == 0) {
336 filterOffset
+= firstNonZero
;
337 filterLength
= lastNonZero
+ 1 - firstNonZero
;
338 MOZ_ASSERT(filterLength
> 0);
340 if (!fFilterValues
.append(&filterValues
[firstNonZero
], filterLength
)) {
344 // Here all the factors were zeroes.
348 FilterInstance instance
= {
349 // We pushed filterLength elements onto fFilterValues
350 int(fFilterValues
.length()) - filterLength
, filterOffset
, filterLength
,
352 if (!fFilters
.append(instance
)) {
353 if (filterLength
> 0) {
354 fFilterValues
.shrinkBy(filterLength
);
359 fMaxFilter
= std::max(fMaxFilter
, filterLength
);
363 bool SkConvolutionFilter1D::ComputeFilterValues(
364 const SkBitmapFilter
& aBitmapFilter
, int32_t aSrcSize
, int32_t aDstSize
) {
365 // When we're doing a magnification, the scale will be larger than one. This
366 // means the destination pixels are much smaller than the source pixels, and
367 // that the range covered by the filter won't necessarily cover any source
368 // pixel boundaries. Therefore, we use these clamped values (max of 1) for
369 // some computations.
370 float scale
= float(aDstSize
) / float(aSrcSize
);
371 float clampedScale
= std::min(1.0f
, scale
);
372 // This is how many source pixels from the center we need to count
373 // to support the filtering function.
374 float srcSupport
= aBitmapFilter
.width() / clampedScale
;
375 float invScale
= 1.0f
/ scale
;
377 mozilla::Vector
<float, 64> filterValues
;
378 mozilla::Vector
<ConvolutionFixed
, 64> fixedFilterValues
;
380 // Loop over all pixels in the output range. We will generate one set of
381 // filter values for each one. Those values will tell us how to blend the
382 // source pixels to compute the destination pixel.
384 // This value is computed based on how SkTDArray::resizeStorageToAtLeast works
385 // in order to ensure that it does not overflow or assert. That functions
388 // and we want to to fit in a 32 bit signed int. Equating that to 2^31-1 and
389 // solving n gives n = (2^31-6)*4/5 = 1717986913.6
390 const int32_t maxToPassToReserveAdditional
= 1717986913;
392 int32_t filterValueCount
= int32_t(ceilf(aDstSize
* srcSupport
* 2));
393 if (aDstSize
> maxToPassToReserveAdditional
|| filterValueCount
< 0 ||
394 filterValueCount
> maxToPassToReserveAdditional
||
395 !reserveAdditional(aDstSize
, filterValueCount
)) {
398 size_t oldFiltersLength
= fFilters
.length();
399 size_t oldFilterValuesLength
= fFilterValues
.length();
400 int oldMaxFilter
= fMaxFilter
;
401 for (int32_t destI
= 0; destI
< aDstSize
; destI
++) {
402 // This is the pixel in the source directly under the pixel in the dest.
403 // Note that we base computations on the "center" of the pixels. To see
404 // why, observe that the destination pixel at coordinates (0, 0) in a 5.0x
405 // downscale should "cover" the pixels around the pixel with *its center*
406 // at coordinates (2.5, 2.5) in the source, not those around (0, 0).
407 // Hence we need to scale coordinates (0.5, 0.5), not (0, 0).
408 float srcPixel
= (static_cast<float>(destI
) + 0.5f
) * invScale
;
410 // Compute the (inclusive) range of source pixels the filter covers.
411 float srcBegin
= std::max(0.0f
, floorf(srcPixel
- srcSupport
));
412 float srcEnd
= std::min(aSrcSize
- 1.0f
, ceilf(srcPixel
+ srcSupport
));
414 // Compute the unnormalized filter value at each location of the source
417 // Sum of the filter values for normalizing.
418 // Distance from the center of the filter, this is the filter coordinate
419 // in source space. We also need to consider the center of the pixel
420 // when comparing distance against 'srcPixel'. In the 5x downscale
421 // example used above the distance from the center of the filter to
422 // the pixel with coordinates (2, 2) should be 0, because its center
424 int32_t filterCount
= int32_t(srcEnd
- srcBegin
) + 1;
425 if (filterCount
<= 0 || !filterValues
.resize(filterCount
) ||
426 !fixedFilterValues
.resize(filterCount
)) {
430 float destFilterDist
= (srcBegin
+ 0.5f
- srcPixel
) * clampedScale
;
431 float filterSum
= 0.0f
;
432 for (int32_t index
= 0; index
< filterCount
; index
++) {
433 float filterValue
= aBitmapFilter
.evaluate(destFilterDist
);
434 filterValues
[index
] = filterValue
;
435 filterSum
+= filterValue
;
436 destFilterDist
+= clampedScale
;
439 // The filter must be normalized so that we don't affect the brightness of
440 // the image. Convert to normalized fixed point.
441 ConvolutionFixed fixedSum
= 0;
442 float invFilterSum
= 1.0f
/ filterSum
;
443 for (int32_t fixedI
= 0; fixedI
< filterCount
; fixedI
++) {
444 ConvolutionFixed curFixed
= ToFixed(filterValues
[fixedI
] * invFilterSum
);
445 fixedSum
+= curFixed
;
446 fixedFilterValues
[fixedI
] = curFixed
;
449 // The conversion to fixed point will leave some rounding errors, which
450 // we add back in to avoid affecting the brightness of the image. We
451 // arbitrarily add this to the center of the filter array (this won't always
452 // be the center of the filter function since it could get clipped on the
453 // edges, but it doesn't matter enough to worry about that case).
454 ConvolutionFixed leftovers
= ToFixed(1) - fixedSum
;
455 fixedFilterValues
[filterCount
/ 2] += leftovers
;
457 if (!AddFilter(int32_t(srcBegin
), fixedFilterValues
.begin(), filterCount
)) {
458 fFilters
.shrinkTo(oldFiltersLength
);
459 fFilterValues
.shrinkTo(oldFilterValuesLength
);
460 fMaxFilter
= oldMaxFilter
;
465 return maxFilter() > 0 && numValues() == aDstSize
;
468 // Does a two-dimensional convolution on the given source image.
470 // It is assumed the source pixel offsets referenced in the input filters
471 // reference only valid pixels, so the source image size is not required. Each
472 // row of the source image starts |sourceByteRowStride| after the previous
473 // one (this allows you to have rows with some padding at the end).
475 // The result will be put into the given output buffer. The destination image
476 // size will be xfilter.numValues() * yfilter.numValues() pixels. It will be
477 // in rows of exactly xfilter.numValues() * 4 bytes.
479 // |sourceHasAlpha| is a hint that allows us to avoid doing computations on
480 // the alpha channel if the image is opaque. If you don't know, set this to
481 // true and it will work properly, but setting this to false will be a few
482 // percent faster if you know the image is opaque.
484 // The layout in memory is assumed to be 4-bytes per pixel in B-G-R-A order
485 // (this is ARGB when loaded into 32-bit words on a little-endian machine).
487 * Returns false if it was unable to perform the convolution/rescale. in which
488 * case the output buffer is assumed to be undefined.
490 bool BGRAConvolve2D(const unsigned char* sourceData
, int sourceByteRowStride
,
491 bool sourceHasAlpha
, const SkConvolutionFilter1D
& filterX
,
492 const SkConvolutionFilter1D
& filterY
,
493 int outputByteRowStride
, unsigned char* output
) {
494 int maxYFilterSize
= filterY
.maxFilter();
496 // The next row in the input that we will generate a horizontally
497 // convolved row for. If the filter doesn't start at the beginning of the
498 // image (this is the case when we are only resizing a subset), then we
499 // don't want to generate any output rows before that. Compute the starting
500 // row for convolution as the first pixel for the first vertical filter.
501 int filterOffset
= 0, filterLength
= 0;
502 const SkConvolutionFilter1D::ConvolutionFixed
* filterValues
=
503 filterY
.FilterForValue(0, &filterOffset
, &filterLength
);
504 int nextXRow
= filterOffset
;
506 // We loop over each row in the input doing a horizontal convolution. This
507 // will result in a horizontally convolved image. We write the results into
508 // a circular buffer of convolved rows and do vertical convolution as rows
509 // are available. This prevents us from having to store the entire
510 // intermediate image and helps cache coherency.
511 // We will need four extra rows to allow horizontal convolution could be done
512 // simultaneously. We also pad each row in row buffer to be aligned-up to
514 // TODO(jiesun): We do not use aligned load from row buffer in vertical
515 // convolution pass yet. Somehow Windows does not like it.
516 int rowBufferWidth
= (filterX
.numValues() + 31) & ~0x1F;
517 int rowBufferHeight
= maxYFilterSize
;
519 // check for too-big allocation requests : crbug.com/528628
521 int64_t size
= int64_t(rowBufferWidth
) * int64_t(rowBufferHeight
);
522 // need some limit, to avoid over-committing success from malloc, but then
523 // crashing when we try to actually use the memory.
524 // 100meg seems big enough to allow "normal" zoom factors and image sizes
525 // through while avoiding the crash seen by the bug (crbug.com/528628)
526 if (size
> 100 * 1024 * 1024) {
527 // printf_stderr("BGRAConvolve2D: tmp allocation [%lld] too
533 CircularRowBuffer
rowBuffer(rowBufferWidth
, rowBufferHeight
, filterOffset
);
534 if (!rowBuffer
.AllocBuffer()) {
538 // Loop over every possible output row, processing just enough horizontal
539 // convolutions to run each subsequent vertical convolution.
540 MOZ_ASSERT(outputByteRowStride
>= filterX
.numValues() * 4);
541 int numOutputRows
= filterY
.numValues();
543 // We need to check which is the last line to convolve before we advance 4
544 // lines in one iteration.
545 int lastFilterOffset
, lastFilterLength
;
546 filterY
.FilterForValue(numOutputRows
- 1, &lastFilterOffset
,
549 for (int outY
= 0; outY
< numOutputRows
; outY
++) {
550 filterValues
= filterY
.FilterForValue(outY
, &filterOffset
, &filterLength
);
552 // Generate output rows until we have enough to run the current filter.
553 while (nextXRow
< filterOffset
+ filterLength
) {
554 convolve_horizontally(
555 &sourceData
[(uint64_t)nextXRow
* sourceByteRowStride
], filterX
,
556 rowBuffer
.advanceRow(), sourceHasAlpha
);
560 // Compute where in the output image this row of final data will go.
561 unsigned char* curOutputRow
= &output
[(uint64_t)outY
* outputByteRowStride
];
563 // Get the list of rows that the circular buffer has, in order.
564 int firstRowInCircularBuffer
;
565 unsigned char* const* rowsToConvolve
=
566 rowBuffer
.GetRowAddresses(&firstRowInCircularBuffer
);
568 // Now compute the start of the subset of those rows that the filter needs.
569 unsigned char* const* firstRowForFilter
=
570 &rowsToConvolve
[filterOffset
- firstRowInCircularBuffer
];
572 convolve_vertically(filterValues
, filterLength
, firstRowForFilter
,
573 filterX
.numValues(), curOutputRow
, sourceHasAlpha
);