gfx/2d/ConvolutionFilterSSE2.cpp

   1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
   2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
   3 // Copyright (c) 2011-2016 Google Inc.
   4 // Use of this source code is governed by a BSD-style license that can be
   5 // found in the gfx/skia/LICENSE file.
   6
#include "SkConvolver.h"
#include "mozilla/Attributes.h"
#include <cstring>
#include <immintrin.h>
  10
  11 namespace skia {
  12
  13 static MOZ_ALWAYS_INLINE void AccumRemainder(
  14     const unsigned char* pixelsLeft,
  15     const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i& accum,
  16     int r) {
  17   int remainder[4] = {0};
  18   for (int i = 0; i < r; i++) {
  19     SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
  20     remainder[0] += coeff * pixelsLeft[i * 4 + 0];
  21     remainder[1] += coeff * pixelsLeft[i * 4 + 1];
  22     remainder[2] += coeff * pixelsLeft[i * 4 + 2];
  23     remainder[3] += coeff * pixelsLeft[i * 4 + 3];
  24   }
  25   __m128i t =
  26       _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]);
  27   accum = _mm_add_epi32(accum, t);
  28 }
  29
  30 // Convolves horizontally along a single row. The row data is given in
  31 // |srcData| and continues for the numValues() of the filter.
  32 void convolve_horizontally_sse2(const unsigned char* srcData,
  33                                 const SkConvolutionFilter1D& filter,
  34                                 unsigned char* outRow, bool /*hasAlpha*/) {
  35   // Output one pixel each iteration, calculating all channels (RGBA) together.
  36   int numValues = filter.numValues();
  37   for (int outX = 0; outX < numValues; outX++) {
  38     // Get the filter that determines the current output pixel.
  39     int filterOffset, filterLength;
  40     const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
  41         filter.FilterForValue(outX, &filterOffset, &filterLength);
  42
  43     // Compute the first pixel in this row that the filter affects. It will
  44     // touch |filterLength| pixels (4 bytes each) after this.
  45     const unsigned char* rowToFilter = &srcData[filterOffset * 4];
  46
  47     __m128i zero = _mm_setzero_si128();
  48     __m128i accum = _mm_setzero_si128();
  49
  50     // We will load and accumulate with four coefficients per iteration.
  51     for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
  52       // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
  53       __m128i coeff, coeff16;
  54       // [16] xx xx xx xx c3 c2 c1 c0
  55       coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues));
  56       // [16] xx xx xx xx c1 c1 c0 c0
  57       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
  58       // [16] c1 c1 c1 c1 c0 c0 c0 c0
  59       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
  60
  61       // Load four pixels => unpack the first two pixels to 16 bits =>
  62       // multiply with coefficients => accumulate the convolution result.
  63       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
  64       __m128i src8 =
  65           _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowToFilter));
  66       // [16] a1 b1 g1 r1 a0 b0 g0 r0
  67       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
  68       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
  69       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
  70       // [32]  a0*c0 b0*c0 g0*c0 r0*c0
  71       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
  72       accum = _mm_add_epi32(accum, t);
  73       // [32]  a1*c1 b1*c1 g1*c1 r1*c1
  74       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
  75       accum = _mm_add_epi32(accum, t);
  76
  77       // Duplicate 3rd and 4th coefficients for all channels =>
  78       // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
  79       // => accumulate the convolution results.
  80       // [16] xx xx xx xx c3 c3 c2 c2
  81       coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
  82       // [16] c3 c3 c3 c3 c2 c2 c2 c2
  83       coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
  84       // [16] a3 g3 b3 r3 a2 g2 b2 r2
  85       src16 = _mm_unpackhi_epi8(src8, zero);
  86       mul_hi = _mm_mulhi_epi16(src16, coeff16);
  87       mul_lo = _mm_mullo_epi16(src16, coeff16);
  88       // [32]  a2*c2 b2*c2 g2*c2 r2*c2
  89       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
  90       accum = _mm_add_epi32(accum, t);
  91       // [32]  a3*c3 b3*c3 g3*c3 r3*c3
  92       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
  93       accum = _mm_add_epi32(accum, t);
  94
  95       // Advance the pixel and coefficients pointers.
  96       rowToFilter += 16;
  97       filterValues += 4;
  98     }
  99
 100     // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3
 101     // coefficients one at a time.
 102     int r = filterLength & 3;
 103     if (r) {
 104       int remainderOffset = (filterOffset + filterLength - r) * 4;
 105       AccumRemainder(srcData + remainderOffset, filterValues, accum, r);
 106     }
 107
 108     // Shift right for fixed point implementation.
 109     accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
 110
 111     // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
 112     accum = _mm_packs_epi32(accum, zero);
 113     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
 114     accum = _mm_packus_epi16(accum, zero);
 115
 116     // Store the pixel value of 32 bits.
 117     *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum);
 118     outRow += 4;
 119   }
 120 }
 121
 122 // Does vertical convolution to produce one output row. The filter values and
 123 // length are given in the first two parameters. These are applied to each
 124 // of the rows pointed to in the |sourceDataRows| array, with each row
 125 // being |pixelWidth| wide.
 126 //
 127 // The output must have room for |pixelWidth * 4| bytes.
 128 template <bool hasAlpha>
 129 static void ConvolveVertically(
 130     const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
 131     int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
 132     unsigned char* outRow) {
 133   // Output four pixels per iteration (16 bytes).
 134   int width = pixelWidth & ~3;
 135   __m128i zero = _mm_setzero_si128();
 136   for (int outX = 0; outX < width; outX += 4) {
 137     // Accumulated result for each pixel. 32 bits per RGBA channel.
 138     __m128i accum0 = _mm_setzero_si128();
 139     __m128i accum1 = _mm_setzero_si128();
 140     __m128i accum2 = _mm_setzero_si128();
 141     __m128i accum3 = _mm_setzero_si128();
 142
 143     // Convolve with one filter coefficient per iteration.
 144     for (int filterY = 0; filterY < filterLength; filterY++) {
 145       // Duplicate the filter coefficient 8 times.
 146       // [16] cj cj cj cj cj cj cj cj
 147       __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
 148
 149       // Load four pixels (16 bytes) together.
 150       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
 151       const __m128i* src =
 152           reinterpret_cast<const __m128i*>(&sourceDataRows[filterY][outX << 2]);
 153       __m128i src8 = _mm_loadu_si128(src);
 154
 155       // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
 156       // multiply with current coefficient => accumulate the result.
 157       // [16] a1 b1 g1 r1 a0 b0 g0 r0
 158       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
 159       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
 160       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
 161       // [32] a0 b0 g0 r0
 162       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
 163       accum0 = _mm_add_epi32(accum0, t);
 164       // [32] a1 b1 g1 r1
 165       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
 166       accum1 = _mm_add_epi32(accum1, t);
 167
 168       // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
 169       // multiply with current coefficient => accumulate the result.
 170       // [16] a3 b3 g3 r3 a2 b2 g2 r2
 171       src16 = _mm_unpackhi_epi8(src8, zero);
 172       mul_hi = _mm_mulhi_epi16(src16, coeff16);
 173       mul_lo = _mm_mullo_epi16(src16, coeff16);
 174       // [32] a2 b2 g2 r2
 175       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
 176       accum2 = _mm_add_epi32(accum2, t);
 177       // [32] a3 b3 g3 r3
 178       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
 179       accum3 = _mm_add_epi32(accum3, t);
 180     }
 181
 182     // Shift right for fixed point implementation.
 183     accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
 184     accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
 185     accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
 186     accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
 187
 188     // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
 189     // [16] a1 b1 g1 r1 a0 b0 g0 r0
 190     accum0 = _mm_packs_epi32(accum0, accum1);
 191     // [16] a3 b3 g3 r3 a2 b2 g2 r2
 192     accum2 = _mm_packs_epi32(accum2, accum3);
 193
 194     // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
 195     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
 196     accum0 = _mm_packus_epi16(accum0, accum2);
 197
 198     if (hasAlpha) {
 199       // Compute the max(ri, gi, bi) for each pixel.
 200       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
 201       __m128i a = _mm_srli_epi32(accum0, 8);
 202       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
 203       __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
 204       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
 205       a = _mm_srli_epi32(accum0, 16);
 206       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
 207       b = _mm_max_epu8(a, b);  // Max of r and g and b.
 208       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
 209       b = _mm_slli_epi32(b, 24);
 210
 211       // Make sure the value of alpha channel is always larger than maximum
 212       // value of color channels.
 213       accum0 = _mm_max_epu8(b, accum0);
 214     } else {
 215       // Set value of alpha channels to 0xFF.
 216       __m128i mask = _mm_set1_epi32(0xff000000);
 217       accum0 = _mm_or_si128(accum0, mask);
 218     }
 219
 220     // Store the convolution result (16 bytes) and advance the pixel pointers.
 221     _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0);
 222     outRow += 16;
 223   }
 224
 225   // When the width of the output is not divisible by 4, We need to save one
 226   // pixel (4 bytes) each time. And also the fourth pixel is always absent.
 227   int r = pixelWidth & 3;
 228   if (r) {
 229     __m128i accum0 = _mm_setzero_si128();
 230     __m128i accum1 = _mm_setzero_si128();
 231     __m128i accum2 = _mm_setzero_si128();
 232     for (int filterY = 0; filterY < filterLength; ++filterY) {
 233       __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
 234       // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
 235       const __m128i* src = reinterpret_cast<const __m128i*>(
 236           &sourceDataRows[filterY][width << 2]);
 237       __m128i src8 = _mm_loadu_si128(src);
 238       // [16] a1 b1 g1 r1 a0 b0 g0 r0
 239       __m128i src16 = _mm_unpacklo_epi8(src8, zero);
 240       __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
 241       __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
 242       // [32] a0 b0 g0 r0
 243       __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
 244       accum0 = _mm_add_epi32(accum0, t);
 245       // [32] a1 b1 g1 r1
 246       t = _mm_unpackhi_epi16(mul_lo, mul_hi);
 247       accum1 = _mm_add_epi32(accum1, t);
 248       // [16] a3 b3 g3 r3 a2 b2 g2 r2
 249       src16 = _mm_unpackhi_epi8(src8, zero);
 250       mul_hi = _mm_mulhi_epi16(src16, coeff16);
 251       mul_lo = _mm_mullo_epi16(src16, coeff16);
 252       // [32] a2 b2 g2 r2
 253       t = _mm_unpacklo_epi16(mul_lo, mul_hi);
 254       accum2 = _mm_add_epi32(accum2, t);
 255     }
 256
 257     accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
 258     accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
 259     accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
 260     // [16] a1 b1 g1 r1 a0 b0 g0 r0
 261     accum0 = _mm_packs_epi32(accum0, accum1);
 262     // [16] a3 b3 g3 r3 a2 b2 g2 r2
 263     accum2 = _mm_packs_epi32(accum2, zero);
 264     // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
 265     accum0 = _mm_packus_epi16(accum0, accum2);
 266     if (hasAlpha) {
 267       // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
 268       __m128i a = _mm_srli_epi32(accum0, 8);
 269       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
 270       __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
 271       // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
 272       a = _mm_srli_epi32(accum0, 16);
 273       // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
 274       b = _mm_max_epu8(a, b);  // Max of r and g and b.
 275       // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
 276       b = _mm_slli_epi32(b, 24);
 277       accum0 = _mm_max_epu8(b, accum0);
 278     } else {
 279       __m128i mask = _mm_set1_epi32(0xff000000);
 280       accum0 = _mm_or_si128(accum0, mask);
 281     }
 282
 283     for (int i = 0; i < r; i++) {
 284       *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0);
 285       accum0 = _mm_srli_si128(accum0, 4);
 286       outRow += 4;
 287     }
 288   }
 289 }
 290
 291 void convolve_vertically_sse2(
 292     const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
 293     int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
 294     unsigned char* outRow, bool hasAlpha) {
 295   if (hasAlpha) {
 296     ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
 297                              pixelWidth, outRow);
 298   } else {
 299     ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
 300                               pixelWidth, outRow);
 301   }
 302 }
 303
 304 }  // namespace skia