gfx/ycbcr/yuv_row_win64.cpp

   1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
   2 // Use of this source code is governed by a BSD-style license that can be
   3 // found in the LICENSE file.
   4
   5 #include "yuv_row.h"
   6
   7 extern "C" {
   8
   9 // x64 compiler doesn't support MMX and inline assembler.  Use SSE2 intrinsics.
  10
  11 #define kCoefficientsRgbU (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 2048)
  12 #define kCoefficientsRgbV (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 4096)
  13
  14 #include <emmintrin.h>
  15
  16 static void FastConvertYUVToRGB32Row_SSE2(const uint8_t* y_buf,
  17                                           const uint8_t* u_buf,
  18                                           const uint8_t* v_buf,
  19                                           uint8_t* rgb_buf,
  20                                           int width) {
  21   __m128i xmm0, xmmY1, xmmY2;
  22   __m128  xmmY;
  23
  24   while (width >= 2) {
  25     xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
  26                           _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
  27
  28     xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++));
  29     xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
  30
  31     xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++));
  32     xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
  33
  34     xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
  35                           0x44);
  36     xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
  37     xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
  38
  39     _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
  40     rgb_buf += 8;
  41     width -= 2;
  42   }
  43
  44   if (width) {
  45     xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
  46                           _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
  47     xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf));
  48     xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
  49     xmmY1 = _mm_srai_epi16(xmmY1, 6);
  50     xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
  51     *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
  52   }
  53 }
  54
  55 static void ScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,
  56                                     const uint8_t* u_buf,
  57                                     const uint8_t* v_buf,
  58                                     uint8_t* rgb_buf,
  59                                     int width,
  60                                     int source_dx) {
  61   __m128i xmm0, xmmY1, xmmY2;
  62   __m128  xmmY;
  63   uint8_t u, v, y;
  64   int x = 0;
  65
  66   while (width >= 2) {
  67     u = u_buf[x >> 17];
  68     v = v_buf[x >> 17];
  69     y = y_buf[x >> 16];
  70     x += source_dx;
  71
  72     xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
  73                           _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
  74     xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
  75     xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
  76
  77     y = y_buf[x >> 16];
  78     x += source_dx;
  79
  80     xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
  81     xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
  82
  83     xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
  84                           0x44);
  85     xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
  86     xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
  87
  88     _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
  89     rgb_buf += 8;
  90     width -= 2;
  91   }
  92
  93   if (width) {
  94     u = u_buf[x >> 17];
  95     v = v_buf[x >> 17];
  96     y = y_buf[x >> 16];
  97
  98     xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
  99                           _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
 100     xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
 101     xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
 102     xmmY1 = _mm_srai_epi16(xmmY1, 6);
 103     xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
 104     *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
 105   }
 106 }
 107
 108 static void LinearScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,
 109                                           const uint8_t* u_buf,
 110                                           const uint8_t* v_buf,
 111                                           uint8_t* rgb_buf,
 112                                           int width,
 113                                           int source_dx) {
 114   __m128i xmm0, xmmY1, xmmY2;
 115   __m128  xmmY;
 116   uint8_t u0, u1, v0, v1, y0, y1;
 117   uint32_t uv_frac, y_frac, u, v, y;
 118   int x = 0;
 119
 120   if (source_dx >= 0x20000) {
 121     x = 32768;
 122   }
 123
 124   while(width >= 2) {
 125     u0 = u_buf[x >> 17];
 126     u1 = u_buf[(x >> 17) + 1];
 127     v0 = v_buf[x >> 17];
 128     v1 = v_buf[(x >> 17) + 1];
 129     y0 = y_buf[x >> 16];
 130     y1 = y_buf[(x >> 16) + 1];
 131     uv_frac = (x & 0x1fffe);
 132     y_frac = (x & 0xffff);
 133     u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
 134     v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
 135     y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
 136     x += source_dx;
 137
 138     xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
 139                           _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
 140     xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
 141     xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
 142
 143     y0 = y_buf[x >> 16];
 144     y1 = y_buf[(x >> 16) + 1];
 145     y_frac = (x & 0xffff);
 146     y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
 147     x += source_dx;
 148
 149     xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
 150     xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
 151
 152     xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
 153                           0x44);
 154     xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
 155     xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
 156
 157     _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
 158     rgb_buf += 8;
 159     width -= 2;
 160   }
 161
 162   if (width) {
 163     u = u_buf[x >> 17];
 164     v = v_buf[x >> 17];
 165     y = y_buf[x >> 16];
 166
 167     xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
 168                           _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
 169     xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
 170
 171     xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
 172     xmmY1 = _mm_srai_epi16(xmmY1, 6);
 173     xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
 174     *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
 175   }
 176 }
 177
 178 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
 179                               const uint8_t* u_buf,
 180                               const uint8_t* v_buf,
 181                               uint8_t* rgb_buf,
 182                               int width) {
 183   FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
 184 }
 185
 186 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
 187                         const uint8_t* u_buf,
 188                         const uint8_t* v_buf,
 189                         uint8_t* rgb_buf,
 190                         int width,
 191                         int source_dx) {
 192   ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
 193 }
 194
 195 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
 196                               const uint8_t* u_buf,
 197                               const uint8_t* v_buf,
 198                               uint8_t* rgb_buf,
 199                               int width,
 200                               int source_dx) {
 201   LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
 202                                 source_dx);
 203 }
 204
 205 } // extern "C"