gfx/ycbcr/convert.patch

   1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
   2 --- a/gfx/ycbcr/yuv_convert.cpp
   3 +++ b/gfx/ycbcr/yuv_convert.cpp
   4 @@ -6,145 +6,102 @@
   5  // http://www.fourcc.org/yuv.php
   6  // The actual conversion is best described here
   7  // http://en.wikipedia.org/wiki/YUV
   8  // An article on optimizing YUV conversion using tables instead of multiplies
   9  // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
  10  //
  11  // YV12 is a full plane of Y and a half height, half width chroma planes
  12  // YV16 is a full plane of Y and a full height, half width chroma planes
  13 +// YV24 is a full plane of Y and a full height, full width chroma planes
  14  //
  15  // ARGB pixel format is output, which on little endian is stored as BGRA.
  16  // The alpha is set to 255, allowing the application to use RGBA or RGB32.
  17
  18 -#include "media/base/yuv_convert.h"
  19 +#include "yuv_convert.h"
  20
  21  // Header for low level row functions.
  22 -#include "media/base/yuv_row.h"
  23 -
  24 -#if USE_MMX
  25 -#if defined(_MSC_VER)
  26 -#include <intrin.h>
  27 -#else
  28 -#include <mmintrin.h>
  29 -#endif
  30 -#endif
  31 -
  32 -#if USE_SSE2
  33 -#include <emmintrin.h>
  34 -#endif
  35 -
  36 -namespace media {
  37 -
  38 +#include "yuv_row.h"
  39 +#include "mozilla/SSE.h"
  40 +
  41 +namespace mozilla {
  42 +
  43 +namespace gfx {
  44 +
  45  // 16.16 fixed point arithmetic
  46  const int kFractionBits = 16;
  47  const int kFractionMax = 1 << kFractionBits;
  48  const int kFractionMask = ((1 << kFractionBits) - 1);
  49
  50  // Convert a frame of YUV to 32 bit ARGB.
  51 -void ConvertYUVToRGB32(const uint8* y_buf,
  52 -                       const uint8* u_buf,
  53 -                       const uint8* v_buf,
  54 -                       uint8* rgb_buf,
  55 -                       int width,
  56 -                       int height,
  57 -                       int y_pitch,
  58 -                       int uv_pitch,
  59 -                       int rgb_pitch,
  60 -                       YUVType yuv_type) {
  61 -  unsigned int y_shift = yuv_type;
  62 -  for (int y = 0; y < height; ++y) {
  63 -    uint8* rgb_row = rgb_buf + y * rgb_pitch;
  64 -    const uint8* y_ptr = y_buf + y * y_pitch;
  65 -    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
  66 -    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
  67 -
  68 -    FastConvertYUVToRGB32Row(y_ptr,
  69 -                             u_ptr,
  70 -                             v_ptr,
  71 -                             rgb_row,
  72 -                             width);
  73 -  }
  74 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
  75 +                                  const uint8* u_buf,
  76 +                                  const uint8* v_buf,
  77 +                                  uint8* rgb_buf,
  78 +                                  int pic_x,
  79 +                                  int pic_y,
  80 +                                  int pic_width,
  81 +                                  int pic_height,
  82 +                                  int y_pitch,
  83 +                                  int uv_pitch,
  84 +                                  int rgb_pitch,
  85 +                                  YUVType yuv_type) {
  86 +  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
  87 +  unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
  88 +  // Test for SSE because the optimized code uses movntq, which is not part of MMX.
  89 +  bool has_sse = supports_mmx() && supports_sse();
  90 +  // There is no optimized YV24 SSE routine so we check for this and
  91 +  // fall back to the C code.
  92 +  has_sse &= yuv_type != YV24;
  93 +  bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
  94 +  int x_width = odd_pic_x ? pic_width - 1 : pic_width;
  95 +
  96 +  for (int y = pic_y; y < pic_height + pic_y; ++y) {
  97 +    uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
  98 +    const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
  99 +    const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
 100 +    const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
 101 +
 102 +    if (odd_pic_x) {
 103 +      // Handle the single odd pixel manually and use the
 104 +      // fast routines for the remaining pixels.
 105 +      FastConvertYUVToRGB32Row_C(y_ptr++,
 106 +                                 u_ptr++,
 107 +                                 v_ptr++,
 108 +                                 rgb_row,
 109 +                                 1,
 110 +                                 x_shift);
 111 +      rgb_row += 4;
 112 +    }
 113 +
 114 +    if (has_sse) {
 115 +      FastConvertYUVToRGB32Row(y_ptr,
 116 +                               u_ptr,
 117 +                               v_ptr,
 118 +                               rgb_row,
 119 +                               x_width);
 120 +    }
 121 +    else {
 122 +      FastConvertYUVToRGB32Row_C(y_ptr,
 123 +                                 u_ptr,
 124 +                                 v_ptr,
 125 +                                 rgb_row,
 126 +                                 x_width,
 127 +                                 x_shift);
 128 +    }
 129 +  }
 130
 131    // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
 132 -  EMMS();
 133 -}
 134 -
 135 -#if USE_SSE2
 136 -// FilterRows combines two rows of the image using linear interpolation.
 137 -// SSE2 version does 16 pixels at a time
 138 -
 139 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 140 -                       int source_width, int source_y_fraction) {
 141 -  __m128i zero = _mm_setzero_si128();
 142 -  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
 143 -  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
 144 -
 145 -  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
 146 -  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
 147 -  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
 148 -  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
 149 -
 150 -  do {
 151 -    __m128i y0 = _mm_loadu_si128(y0_ptr128);
 152 -    __m128i y1 = _mm_loadu_si128(y1_ptr128);
 153 -    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
 154 -    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
 155 -    y0 = _mm_unpacklo_epi8(y0, zero);
 156 -    y1 = _mm_unpacklo_epi8(y1, zero);
 157 -    y0 = _mm_mullo_epi16(y0, y0_fraction);
 158 -    y1 = _mm_mullo_epi16(y1, y1_fraction);
 159 -    y2 = _mm_mullo_epi16(y2, y0_fraction);
 160 -    y3 = _mm_mullo_epi16(y3, y1_fraction);
 161 -    y0 = _mm_add_epi16(y0, y1);
 162 -    y2 = _mm_add_epi16(y2, y3);
 163 -    y0 = _mm_srli_epi16(y0, 8);
 164 -    y2 = _mm_srli_epi16(y2, 8);
 165 -    y0 = _mm_packus_epi16(y0, y2);
 166 -    *dest128++ = y0;
 167 -    ++y0_ptr128;
 168 -    ++y1_ptr128;
 169 -  } while (dest128 < end128);
 170 -}
 171 -#elif USE_MMX
 172 -// MMX version does 8 pixels at a time
 173 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 174 -                       int source_width, int source_y_fraction) {
 175 -  __m64 zero = _mm_setzero_si64();
 176 -  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
 177 -  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
 178 -
 179 -  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
 180 -  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
 181 -  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
 182 -  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
 183 -
 184 -  do {
 185 -    __m64 y0 = *y0_ptr64++;
 186 -    __m64 y1 = *y1_ptr64++;
 187 -    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
 188 -    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
 189 -    y0 = _mm_unpacklo_pi8(y0, zero);
 190 -    y1 = _mm_unpacklo_pi8(y1, zero);
 191 -    y0 = _mm_mullo_pi16(y0, y0_fraction);
 192 -    y1 = _mm_mullo_pi16(y1, y1_fraction);
 193 -    y2 = _mm_mullo_pi16(y2, y0_fraction);
 194 -    y3 = _mm_mullo_pi16(y3, y1_fraction);
 195 -    y0 = _mm_add_pi16(y0, y1);
 196 -    y2 = _mm_add_pi16(y2, y3);
 197 -    y0 = _mm_srli_pi16(y0, 8);
 198 -    y2 = _mm_srli_pi16(y2, 8);
 199 -    y0 = _mm_packs_pu16(y0, y2);
 200 -    *dest64++ = y0;
 201 -  } while (dest64 < end64);
 202 -}
 203 -#else  // no MMX or SSE2
 204 +  if (has_sse)
 205 +    EMMS();
 206 +}
 207 +
 208  // C version does 8 at a time to mimic MMX code
 209 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 210 -                       int source_width, int source_y_fraction) {
 211 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 212 +                         int source_width, int source_y_fraction) {
 213    int y1_fraction = source_y_fraction;
 214    int y0_fraction = 256 - y1_fraction;
 215    uint8* end = ybuf + source_width;
 216    do {
 217      ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
 218      ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
 219      ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
 220      ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
 221 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
 222      ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
 223      ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
 224      ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
 225      y0_ptr += 8;
 226      y1_ptr += 8;
 227      ybuf += 8;
 228    } while (ybuf < end);
 229  }
 230 -#endif
 231 +
 232 +#ifdef MOZILLA_MAY_SUPPORT_MMX
 233 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 234 +                    int source_width, int source_y_fraction);
 235 +#endif
 236 +
 237 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
 238 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 239 +                     int source_width, int source_y_fraction);
 240 +#endif
 241 +
 242 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
 243 +                              const uint8* y1_ptr, int source_width,
 244 +                              int source_y_fraction) {
 245 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
 246 +  if (mozilla::supports_sse2()) {
 247 +    FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
 248 +    return;
 249 +  }
 250 +#endif
 251 +
 252 +#ifdef MOZILLA_MAY_SUPPORT_MMX
 253 +  if (mozilla::supports_mmx()) {
 254 +    FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
 255 +    return;
 256 +  }
 257 +#endif
 258 +
 259 +  FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
 260 +}
 261
 262
 263  // Scale a frame of YUV to 32 bit ARGB.
 264 -void ScaleYUVToRGB32(const uint8* y_buf,
 265 -                     const uint8* u_buf,
 266 -                     const uint8* v_buf,
 267 -                     uint8* rgb_buf,
 268 -                     int source_width,
 269 -                     int source_height,
 270 -                     int width,
 271 -                     int height,
 272 -                     int y_pitch,
 273 -                     int uv_pitch,
 274 -                     int rgb_pitch,
 275 -                     YUVType yuv_type,
 276 -                     Rotate view_rotate,
 277 -                     ScaleFilter filter) {
 278 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
 279 +                                const uint8* u_buf,
 280 +                                const uint8* v_buf,
 281 +                                uint8* rgb_buf,
 282 +                                int source_width,
 283 +                                int source_height,
 284 +                                int width,
 285 +                                int height,
 286 +                                int y_pitch,
 287 +                                int uv_pitch,
 288 +                                int rgb_pitch,
 289 +                                YUVType yuv_type,
 290 +                                Rotate view_rotate,
 291 +                                ScaleFilter filter) {
 292 +  bool has_mmx = supports_mmx();
 293 +
 294    // 4096 allows 3 buffers to fit in 12k.
 295    // Helps performance on CPU with 16K L1 cache.
 296    // Large enough for 3830x2160 and 30" displays which are 2560x1600.
 297    const int kFilterBufferSize = 4096;
 298    // Disable filtering if the screen is too big (to avoid buffer overflows).
 299    // This should never happen to regular users: they don't have monitors
 300    // wider than 4096 pixels.
 301    // TODO(fbarchard): Allow rotated videos to filter.
 302    if (source_width > kFilterBufferSize || view_rotate)
 303      filter = FILTER_NONE;
 304
 305 -  unsigned int y_shift = yuv_type;
 306 +  unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
 307    // Diagram showing origin and direction of source sampling.
 308    // ->0   4<-
 309    // 7       3
 310    //
 311    // 6       5
 312    // ->1   2<-
 313    // Rotations that start at right side of image.
 314    if ((view_rotate == ROTATE_180) ||
 315 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
 316      int source_uv_fraction =
 317          ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
 318
 319      const uint8* y_ptr = y0_ptr;
 320      const uint8* u_ptr = u0_ptr;
 321      const uint8* v_ptr = v0_ptr;
 322      // Apply vertical filtering if necessary.
 323      // TODO(fbarchard): Remove memcpy when not necessary.
 324 -    if (filter & media::FILTER_BILINEAR_V) {
 325 +    if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
 326        if (yscale_fixed != kFractionMax &&
 327            source_y_fraction && ((source_y + 1) < source_height)) {
 328          FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
 329        } else {
 330          memcpy(ybuf, y0_ptr, source_width);
 331        }
 332        y_ptr = ybuf;
 333        ybuf[source_width] = ybuf[source_width-1];
 334 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
 335        u_ptr = ubuf;
 336        v_ptr = vbuf;
 337        ubuf[uv_source_width] = ubuf[uv_source_width - 1];
 338        vbuf[uv_source_width] = vbuf[uv_source_width - 1];
 339      }
 340      if (source_dx == kFractionMax) {  // Not scaled
 341        FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 342                                 dest_pixel, width);
 343 -    } else {
 344 -      if (filter & FILTER_BILINEAR_H) {
 345 +    } else if (filter & FILTER_BILINEAR_H) {
 346          LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 347                                   dest_pixel, width, source_dx);
 348      } else {
 349  // Specialized scalers and rotation.
 350 -#if USE_MMX && defined(_MSC_VER)
 351 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
 352 +      if(mozilla::supports_sse()) {
 353          if (width == (source_width * 2)) {
 354 -          DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 355 -                              dest_pixel, width);
 356 +          DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
 357 +                                  dest_pixel, width);
 358          } else if ((source_dx & kFractionMask) == 0) {
 359            // Scaling by integer scale factor. ie half.
 360 -          ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 361 -                               dest_pixel, width,
 362 -                               source_dx >> kFractionBits);
 363 +          ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
 364 +                                   dest_pixel, width,
 365 +                                   source_dx >> kFractionBits);
 366          } else if (source_dx_uv == source_dx) {  // Not rotated.
 367            ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 368                               dest_pixel, width, source_dx);
 369          } else {
 370 -          RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 371 -                                     dest_pixel, width,
 372 -                                     source_dx >> kFractionBits,
 373 -                                     source_dx_uv >> kFractionBits);
 374 +          RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
 375 +                                         dest_pixel, width,
 376 +                                         source_dx >> kFractionBits,
 377 +                                         source_dx_uv >> kFractionBits);
 378          }
 379 +      }
 380 +      else {
 381 +        ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
 382 +                             dest_pixel, width, source_dx);
 383 +      }
 384  #else
 385 -        ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 386 -                           dest_pixel, width, source_dx);
 387 -#endif
 388 -      }
 389 +      ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
 390 +                         dest_pixel, width, source_dx);
 391 +#endif
 392      }
 393    }
 394    // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
 395 -  EMMS();
 396 -}
 397 -
 398 -}  // namespace media
 399 +  if (has_mmx)
 400 +    EMMS();
 401 +}
 402 +
 403 +}  // namespace gfx
 404 +}  // namespace mozilla
 405 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
 406 --- a/gfx/ycbcr/yuv_convert.h
 407 +++ b/gfx/ycbcr/yuv_convert.h
 408 @@ -1,72 +1,79 @@
 409  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 410  // Use of this source code is governed by a BSD-style license that can be
 411  // found in the LICENSE file.
 412
 413  #ifndef MEDIA_BASE_YUV_CONVERT_H_
 414  #define MEDIA_BASE_YUV_CONVERT_H_
 415
 416 -#include "base/basictypes.h"
 417 -
 418 -namespace media {
 419 -
 420 +#include "chromium_types.h"
 421 +#include "gfxCore.h"
 422 +
 423 +namespace mozilla {
 424 +
 425 +namespace gfx {
 426 +
 427  // Type of YUV surface.
 428  // The value of these enums matter as they are used to shift vertical indices.
 429  enum YUVType {
 430 -  YV16 = 0,           // YV16 is half width and full height chroma channels.
 431 -  YV12 = 1,           // YV12 is half width and half height chroma channels.
 432 +  YV12 = 0,           // YV12 is half width and half height chroma channels.
 433 +  YV16 = 1,           // YV16 is half width and full height chroma channels.
 434 +  YV24 = 2            // YV24 is full width and full height chroma channels.
 435  };
 436
 437  // Mirror means flip the image horizontally, as in looking in a mirror.
 438  // Rotate happens after mirroring.
 439  enum Rotate {
 440    ROTATE_0,           // Rotation off.
 441    ROTATE_90,          // Rotate clockwise.
 442    ROTATE_180,         // Rotate upside down.
 443    ROTATE_270,         // Rotate counter clockwise.
 444    MIRROR_ROTATE_0,    // Mirror horizontally.
 445    MIRROR_ROTATE_90,   // Mirror then Rotate clockwise.
 446    MIRROR_ROTATE_180,  // Mirror vertically.
 447 -  MIRROR_ROTATE_270,  // Transpose.
 448 +  MIRROR_ROTATE_270   // Transpose.
 449  };
 450
 451  // Filter affects how scaling looks.
 452  enum ScaleFilter {
 453    FILTER_NONE = 0,        // No filter (point sampled).
 454    FILTER_BILINEAR_H = 1,  // Bilinear horizontal filter.
 455    FILTER_BILINEAR_V = 2,  // Bilinear vertical filter.
 456 -  FILTER_BILINEAR = 3,    // Bilinear filter.
 457 +  FILTER_BILINEAR = 3     // Bilinear filter.
 458  };
 459
 460  // Convert a frame of YUV to 32 bit ARGB.
 461  // Pass in YV16/YV12 depending on source format
 462 -void ConvertYUVToRGB32(const uint8* yplane,
 463 -                       const uint8* uplane,
 464 -                       const uint8* vplane,
 465 -                       uint8* rgbframe,
 466 -                       int width,
 467 -                       int height,
 468 -                       int ystride,
 469 -                       int uvstride,
 470 -                       int rgbstride,
 471 -                       YUVType yuv_type);
 472 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
 473 +                                  const uint8* uplane,
 474 +                                  const uint8* vplane,
 475 +                                  uint8* rgbframe,
 476 +                                  int pic_x,
 477 +                                  int pic_y,
 478 +                                  int pic_width,
 479 +                                  int pic_height,
 480 +                                  int ystride,
 481 +                                  int uvstride,
 482 +                                  int rgbstride,
 483 +                                  YUVType yuv_type);
 484
 485  // Scale a frame of YUV to 32 bit ARGB.
 486  // Supports rotation and mirroring.
 487 -void ScaleYUVToRGB32(const uint8* yplane,
 488 -                     const uint8* uplane,
 489 -                     const uint8* vplane,
 490 -                     uint8* rgbframe,
 491 -                     int source_width,
 492 -                     int source_height,
 493 -                     int width,
 494 -                     int height,
 495 -                     int ystride,
 496 -                     int uvstride,
 497 -                     int rgbstride,
 498 -                     YUVType yuv_type,
 499 -                     Rotate view_rotate,
 500 -                     ScaleFilter filter);
 501 -
 502 -}  // namespace media
 503 -
 504 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
 505 +                                const uint8* uplane,
 506 +                                const uint8* vplane,
 507 +                                uint8* rgbframe,
 508 +                                int source_width,
 509 +                                int source_height,
 510 +                                int width,
 511 +                                int height,
 512 +                                int ystride,
 513 +                                int uvstride,
 514 +                                int rgbstride,
 515 +                                YUVType yuv_type,
 516 +                                Rotate view_rotate,
 517 +                                ScaleFilter filter);
 518 +
 519 +}  // namespace gfx
 520 +}  // namespace mozilla
 521 +
 522  #endif  // MEDIA_BASE_YUV_CONVERT_H_
 523 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
 524 new file mode 100644
 525 --- /dev/null
 526 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp
 527 @@ -0,0 +1,45 @@
 528 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 529 +// Use of this source code is governed by a BSD-style license that can be
 530 +// found in the LICENSE file.
 531 +
 532 +#include <mmintrin.h>
 533 +#include "yuv_row.h"
 534 +
 535 +namespace mozilla {
 536 +namespace gfx {
 537 +
 538 +// FilterRows combines two rows of the image using linear interpolation.
 539 +// MMX version does 8 pixels at a time.
 540 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 541 +                    int source_width, int source_y_fraction) {
 542 +  __m64 zero = _mm_setzero_si64();
 543 +  __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
 544 +  __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
 545 +
 546 +  const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
 547 +  const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
 548 +  __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
 549 +  __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
 550 +
 551 +  do {
 552 +    __m64 y0 = *y0_ptr64++;
 553 +    __m64 y1 = *y1_ptr64++;
 554 +    __m64 y2 = _mm_unpackhi_pi8(y0, zero);
 555 +    __m64 y3 = _mm_unpackhi_pi8(y1, zero);
 556 +    y0 = _mm_unpacklo_pi8(y0, zero);
 557 +    y1 = _mm_unpacklo_pi8(y1, zero);
 558 +    y0 = _mm_mullo_pi16(y0, y0_fraction);
 559 +    y1 = _mm_mullo_pi16(y1, y1_fraction);
 560 +    y2 = _mm_mullo_pi16(y2, y0_fraction);
 561 +    y3 = _mm_mullo_pi16(y3, y1_fraction);
 562 +    y0 = _mm_add_pi16(y0, y1);
 563 +    y2 = _mm_add_pi16(y2, y3);
 564 +    y0 = _mm_srli_pi16(y0, 8);
 565 +    y2 = _mm_srli_pi16(y2, 8);
 566 +    y0 = _mm_packs_pu16(y0, y2);
 567 +    *dest64++ = y0;
 568 +  } while (dest64 < end64);
 569 +}
 570 +
 571 +}
 572 +}
 573 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
 574 new file mode 100644
 575 --- /dev/null
 576 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp
 577 @@ -0,0 +1,47 @@
 578 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
 579 +// Use of this source code is governed by a BSD-style license that can be
 580 +// found in the LICENSE file.
 581 +
 582 +#include <emmintrin.h>
 583 +#include "yuv_row.h"
 584 +
 585 +namespace mozilla {
 586 +namespace gfx {
 587 +
 588 +// FilterRows combines two rows of the image using linear interpolation.
 589 +// SSE2 version does 16 pixels at a time.
 590 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
 591 +                     int source_width, int source_y_fraction) {
 592 +  __m128i zero = _mm_setzero_si128();
 593 +  __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
 594 +  __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
 595 +
 596 +  const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
 597 +  const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
 598 +  __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
 599 +  __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
 600 +
 601 +  do {
 602 +    __m128i y0 = _mm_loadu_si128(y0_ptr128);
 603 +    __m128i y1 = _mm_loadu_si128(y1_ptr128);
 604 +    __m128i y2 = _mm_unpackhi_epi8(y0, zero);
 605 +    __m128i y3 = _mm_unpackhi_epi8(y1, zero);
 606 +    y0 = _mm_unpacklo_epi8(y0, zero);
 607 +    y1 = _mm_unpacklo_epi8(y1, zero);
 608 +    y0 = _mm_mullo_epi16(y0, y0_fraction);
 609 +    y1 = _mm_mullo_epi16(y1, y1_fraction);
 610 +    y2 = _mm_mullo_epi16(y2, y0_fraction);
 611 +    y3 = _mm_mullo_epi16(y3, y1_fraction);
 612 +    y0 = _mm_add_epi16(y0, y1);
 613 +    y2 = _mm_add_epi16(y2, y3);
 614 +    y0 = _mm_srli_epi16(y0, 8);
 615 +    y2 = _mm_srli_epi16(y2, 8);
 616 +    y0 = _mm_packus_epi16(y0, y2);
 617 +    *dest128++ = y0;
 618 +    ++y0_ptr128;
 619 +    ++y1_ptr128;
 620 +  } while (dest128 < end128);
 621 +}
 622 +
 623 +}
 624 +}
 625 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
 626 --- a/gfx/ycbcr/yuv_row.h
 627 +++ b/gfx/ycbcr/yuv_row.h
 628 @@ -5,109 +5,133 @@
 629  // yuv_row internal functions to handle YUV conversion and scaling to RGB.
 630  // These functions are used from both yuv_convert.cc and yuv_scale.cc.
 631
 632  // TODO(fbarchard): Write function that can handle rotation and scaling.
 633
 634  #ifndef MEDIA_BASE_YUV_ROW_H_
 635  #define MEDIA_BASE_YUV_ROW_H_
 636
 637 -#include "base/basictypes.h"
 638 +#include "chromium_types.h"
 639
 640  extern "C" {
 641  // Can only do 1x.
 642  // This is the second fastest of the scalers.
 643  void FastConvertYUVToRGB32Row(const uint8* y_buf,
 644                                const uint8* u_buf,
 645                                const uint8* v_buf,
 646                                uint8* rgb_buf,
 647                                int width);
 648
 649 -// Can do 1x, half size or any scale down by an integer amount.
 650 -// Step can be negative (mirroring, rotate 180).
 651 -// This is the third fastest of the scalers.
 652 -void ConvertYUVToRGB32Row(const uint8* y_buf,
 653 -                          const uint8* u_buf,
 654 -                          const uint8* v_buf,
 655 -                          uint8* rgb_buf,
 656 -                          int width,
 657 -                          int step);
 658 -
 659 -// Rotate is like Convert, but applies different step to Y versus U and V.
 660 -// This allows rotation by 90 or 270, by stepping by stride.
 661 -// This is the forth fastest of the scalers.
 662 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
 663 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
 664                                  const uint8* u_buf,
 665                                  const uint8* v_buf,
 666                                  uint8* rgb_buf,
 667                                  int width,
 668 -                                int ystep,
 669 -                                int uvstep);
 670 +                                unsigned int x_shift);
 671 +
 672 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
 673 +                              const uint8* u_buf,
 674 +                              const uint8* v_buf,
 675 +                              uint8* rgb_buf,
 676 +                              int width);
 677 +
 678 +// Can do 1x, half size or any scale down by an integer amount.
 679 +// Step can be negative (mirroring, rotate 180).
 680 +// This is the third fastest of the scalers.
 681 +// Only defined on Windows x86-32.
 682 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
 683 +                              const uint8* u_buf,
 684 +                              const uint8* v_buf,
 685 +                              uint8* rgb_buf,
 686 +                              int width,
 687 +                              int step);
 688 +
 689 +// Rotate is like Convert, but applies different step to Y versus U and V.
 690 +// This allows rotation by 90 or 270, by stepping by stride.
 691 +// This is the fourth fastest of the scalers.
 692 +// Only defined on Windows x86-32.
 693 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
 694 +                                    const uint8* u_buf,
 695 +                                    const uint8* v_buf,
 696 +                                    uint8* rgb_buf,
 697 +                                    int width,
 698 +                                    int ystep,
 699 +                                    int uvstep);
 700
 701  // Doubler does 4 pixels at a time.  Each pixel is replicated.
 702  // This is the fastest of the scalers.
 703 -void DoubleYUVToRGB32Row(const uint8* y_buf,
 704 -                         const uint8* u_buf,
 705 -                         const uint8* v_buf,
 706 -                         uint8* rgb_buf,
 707 -                         int width);
 708 +// Only defined on Windows x86-32.
 709 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
 710 +                             const uint8* u_buf,
 711 +                             const uint8* v_buf,
 712 +                             uint8* rgb_buf,
 713 +                             int width);
 714
 715  // Handles arbitrary scaling up or down.
 716  // Mirroring is supported, but not 90 or 270 degree rotation.
 717  // Chroma is under sampled every 2 pixels for performance.
 718  void ScaleYUVToRGB32Row(const uint8* y_buf,
 719                          const uint8* u_buf,
 720                          const uint8* v_buf,
 721                          uint8* rgb_buf,
 722                          int width,
 723                          int source_dx);
 724
 725 +void ScaleYUVToRGB32Row(const uint8* y_buf,
 726 +                        const uint8* u_buf,
 727 +                        const uint8* v_buf,
 728 +                        uint8* rgb_buf,
 729 +                        int width,
 730 +                        int source_dx);
 731 +
 732 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
 733 +                          const uint8* u_buf,
 734 +                          const uint8* v_buf,
 735 +                          uint8* rgb_buf,
 736 +                          int width,
 737 +                          int source_dx);
 738 +
 739  // Handles arbitrary scaling up or down with bilinear filtering.
 740  // Mirroring is supported, but not 90 or 270 degree rotation.
 741  // Chroma is under sampled every 2 pixels for performance.
 742  // This is the slowest of the scalers.
 743  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
 744                                const uint8* u_buf,
 745                                const uint8* v_buf,
 746                                uint8* rgb_buf,
 747                                int width,
 748                                int source_dx);
 749
 750 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
 751 +                              const uint8* u_buf,
 752 +                              const uint8* v_buf,
 753 +                              uint8* rgb_buf,
 754 +                              int width,
 755 +                              int source_dx);
 756 +
 757 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
 758 +                                const uint8* u_buf,
 759 +                                const uint8* v_buf,
 760 +                                uint8* rgb_buf,
 761 +                                int width,
 762 +                                int source_dx);
 763 +
 764 +
 765  #if defined(_MSC_VER)
 766  #define SIMD_ALIGNED(var) __declspec(align(16)) var
 767  #else
 768  #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 769  #endif
 770  extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
 771
 772 -// Method to force C version.
 773 -//#define USE_MMX 0
 774 -//#define USE_SSE2 0
 775 -
 776 -#if !defined(USE_MMX)
 777 -// Windows, Mac and Linux/BSD use MMX
 778 -#if defined(__MMX__) || defined(_MSC_VER)
 779 -#define USE_MMX 1
 780 -#else
 781 -#define USE_MMX 0
 782 -#endif
 783 -#endif
 784 -
 785 -#if !defined(USE_SSE2)
 786 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
 787 -#define USE_SSE2 1
 788 -#else
 789 -#define USE_SSE2 0
 790 -#endif
 791 -#endif
 792 -
 793  // x64 uses MMX2 (SSE) so emms is not required.
 794  // Warning C4799: function has no EMMS instruction.
 795  // EMMS() is slow and should be called by the calling function once per image.
 796 -#if USE_MMX && !defined(ARCH_CPU_X86_64)
 797 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
 798  #if defined(_MSC_VER)
 799  #define EMMS() __asm emms
 800  #pragma warning(disable: 4799)
 801  #else
 802  #define EMMS() asm("emms")
 803  #endif
 804  #else
 805  #define EMMS()
 806 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
 807 --- a/gfx/ycbcr/yuv_row_c.cpp
 808 +++ b/gfx/ycbcr/yuv_row_c.cpp
 809 @@ -1,812 +1,18 @@
 810  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
 811  // Use of this source code is governed by a BSD-style license that can be
 812  // found in the LICENSE file.
 813
 814 -#include "media/base/yuv_row.h"
 815 -
 816 -#ifdef _DEBUG
 817 -#include "base/logging.h"
 818 -#else
 819 +#include "yuv_row.h"
 820 +
 821  #define DCHECK(a)
 822 -#endif
 823
 824  extern "C" {
 825
 826 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
 827 -
 828 -// AMD64 ABI uses register paremters.
 829 -void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
 830 -                              const uint8* u_buf,  // rsi
 831 -                              const uint8* v_buf,  // rdx
 832 -                              uint8* rgb_buf,      // rcx
 833 -                              int width) {         // r8
 834 -  asm(
 835 -  "jmp    convertend\n"
 836 -"convertloop:"
 837 -  "movzb  (%1),%%r10\n"
 838 -  "add    $0x1,%1\n"
 839 -  "movzb  (%2),%%r11\n"
 840 -  "add    $0x1,%2\n"
 841 -  "movq   2048(%5,%%r10,8),%%xmm0\n"
 842 -  "movzb  (%0),%%r10\n"
 843 -  "movq   4096(%5,%%r11,8),%%xmm1\n"
 844 -  "movzb  0x1(%0),%%r11\n"
 845 -  "paddsw %%xmm1,%%xmm0\n"
 846 -  "movq   (%5,%%r10,8),%%xmm2\n"
 847 -  "add    $0x2,%0\n"
 848 -  "movq   (%5,%%r11,8),%%xmm3\n"
 849 -  "paddsw %%xmm0,%%xmm2\n"
 850 -  "paddsw %%xmm0,%%xmm3\n"
 851 -  "shufps $0x44,%%xmm3,%%xmm2\n"
 852 -  "psraw  $0x6,%%xmm2\n"
 853 -  "packuswb %%xmm2,%%xmm2\n"
 854 -  "movq   %%xmm2,0x0(%3)\n"
 855 -  "add    $0x8,%3\n"
 856 -"convertend:"
 857 -  "sub    $0x2,%4\n"
 858 -  "jns    convertloop\n"
 859 -
 860 -"convertnext:"
 861 -  "add    $0x1,%4\n"
 862 -  "js     convertdone\n"
 863 -
 864 -  "movzb  (%1),%%r10\n"
 865 -  "movq   2048(%5,%%r10,8),%%xmm0\n"
 866 -  "movzb  (%2),%%r10\n"
 867 -  "movq   4096(%5,%%r10,8),%%xmm1\n"
 868 -  "paddsw %%xmm1,%%xmm0\n"
 869 -  "movzb  (%0),%%r10\n"
 870 -  "movq   (%5,%%r10,8),%%xmm1\n"
 871 -  "paddsw %%xmm0,%%xmm1\n"
 872 -  "psraw  $0x6,%%xmm1\n"
 873 -  "packuswb %%xmm1,%%xmm1\n"
 874 -  "movd   %%xmm1,0x0(%3)\n"
 875 -"convertdone:"
 876 -  :
 877 -  : "r"(y_buf),  // %0
 878 -    "r"(u_buf),  // %1
 879 -    "r"(v_buf),  // %2
 880 -    "r"(rgb_buf),  // %3
 881 -    "r"(width),  // %4
 882 -    "r" (kCoefficientsRgbY)  // %5
 883 -  : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
 884 -);
 885 -}
 886 -
 887 -void ScaleYUVToRGB32Row(const uint8* y_buf,  // rdi
 888 -                        const uint8* u_buf,  // rsi
 889 -                        const uint8* v_buf,  // rdx
 890 -                        uint8* rgb_buf,      // rcx
 891 -                        int width,           // r8
 892 -                        int source_dx) {     // r9
 893 -  asm(
 894 -  "xor    %%r11,%%r11\n"
 895 -  "sub    $0x2,%4\n"
 896 -  "js     scalenext\n"
 897 -
 898 -"scaleloop:"
 899 -  "mov    %%r11,%%r10\n"
 900 -  "sar    $0x11,%%r10\n"
 901 -  "movzb  (%1,%%r10,1),%%rax\n"
 902 -  "movq   2048(%5,%%rax,8),%%xmm0\n"
 903 -  "movzb  (%2,%%r10,1),%%rax\n"
 904 -  "movq   4096(%5,%%rax,8),%%xmm1\n"
 905 -  "lea    (%%r11,%6),%%r10\n"
 906 -  "sar    $0x10,%%r11\n"
 907 -  "movzb  (%0,%%r11,1),%%rax\n"
 908 -  "paddsw %%xmm1,%%xmm0\n"
 909 -  "movq   (%5,%%rax,8),%%xmm1\n"
 910 -  "lea    (%%r10,%6),%%r11\n"
 911 -  "sar    $0x10,%%r10\n"
 912 -  "movzb  (%0,%%r10,1),%%rax\n"
 913 -  "movq   (%5,%%rax,8),%%xmm2\n"
 914 -  "paddsw %%xmm0,%%xmm1\n"
 915 -  "paddsw %%xmm0,%%xmm2\n"
 916 -  "shufps $0x44,%%xmm2,%%xmm1\n"
 917 -  "psraw  $0x6,%%xmm1\n"
 918 -  "packuswb %%xmm1,%%xmm1\n"
 919 -  "movq   %%xmm1,0x0(%3)\n"
 920 -  "add    $0x8,%3\n"
 921 -  "sub    $0x2,%4\n"
 922 -  "jns    scaleloop\n"
 923 -
 924 -"scalenext:"
 925 -  "add    $0x1,%4\n"
 926 -  "js     scaledone\n"
 927 -
 928 -  "mov    %%r11,%%r10\n"
 929 -  "sar    $0x11,%%r10\n"
 930 -  "movzb  (%1,%%r10,1),%%rax\n"
 931 -  "movq   2048(%5,%%rax,8),%%xmm0\n"
 932 -  "movzb  (%2,%%r10,1),%%rax\n"
 933 -  "movq   4096(%5,%%rax,8),%%xmm1\n"
 934 -  "paddsw %%xmm1,%%xmm0\n"
 935 -  "sar    $0x10,%%r11\n"
 936 -  "movzb  (%0,%%r11,1),%%rax\n"
 937 -  "movq   (%5,%%rax,8),%%xmm1\n"
 938 -  "paddsw %%xmm0,%%xmm1\n"
 939 -  "psraw  $0x6,%%xmm1\n"
 940 -  "packuswb %%xmm1,%%xmm1\n"
 941 -  "movd   %%xmm1,0x0(%3)\n"
 942 -
 943 -"scaledone:"
 944 -  :
 945 -  : "r"(y_buf),  // %0
 946 -    "r"(u_buf),  // %1
 947 -    "r"(v_buf),  // %2
 948 -    "r"(rgb_buf),  // %3
 949 -    "r"(width),  // %4
 950 -    "r" (kCoefficientsRgbY),  // %5
 951 -    "r"(static_cast<long>(source_dx))  // %6
 952 -  : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
 953 -);
 954 -}
 955 -
 956 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
 957 -                              const uint8* u_buf,
 958 -                              const uint8* v_buf,
 959 -                              uint8* rgb_buf,
 960 -                              int width,
 961 -                              int source_dx) {
 962 -  asm(
 963 -  "xor    %%r11,%%r11\n"   // x = 0
 964 -  "sub    $0x2,%4\n"
 965 -  "js     .lscalenext\n"
 966 -  "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
 967 -  "jl     .lscalehalf\n"
 968 -  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
 969 -".lscalehalf:"
 970 -
 971 -".lscaleloop:"
 972 -  "mov    %%r11,%%r10\n"
 973 -  "sar    $0x11,%%r10\n"
 974 -
 975 -  "movzb  (%1, %%r10, 1), %%r13 \n"
 976 -  "movzb  1(%1, %%r10, 1), %%r14 \n"
 977 -  "mov    %%r11, %%rax \n"
 978 -  "and    $0x1fffe, %%rax \n"
 979 -  "imul   %%rax, %%r14 \n"
 980 -  "xor    $0x1fffe, %%rax \n"
 981 -  "imul   %%rax, %%r13 \n"
 982 -  "add    %%r14, %%r13 \n"
 983 -  "shr    $17, %%r13 \n"
 984 -  "movq   2048(%5,%%r13,8), %%xmm0\n"
 985 -
 986 -  "movzb  (%2, %%r10, 1), %%r13 \n"
 987 -  "movzb  1(%2, %%r10, 1), %%r14 \n"
 988 -  "mov    %%r11, %%rax \n"
 989 -  "and    $0x1fffe, %%rax \n"
 990 -  "imul   %%rax, %%r14 \n"
 991 -  "xor    $0x1fffe, %%rax \n"
 992 -  "imul   %%rax, %%r13 \n"
 993 -  "add    %%r14, %%r13 \n"
 994 -  "shr    $17, %%r13 \n"
 995 -  "movq   4096(%5,%%r13,8), %%xmm1\n"
 996 -
 997 -  "mov    %%r11, %%rax \n"
 998 -  "lea    (%%r11,%6),%%r10\n"
 999 -  "sar    $0x10,%%r11\n"
1000 -  "paddsw %%xmm1,%%xmm0\n"
1001 -
1002 -  "movzb  (%0, %%r11, 1), %%r13 \n"
1003 -  "movzb  1(%0, %%r11, 1), %%r14 \n"
1004 -  "and    $0xffff, %%rax \n"
1005 -  "imul   %%rax, %%r14 \n"
1006 -  "xor    $0xffff, %%rax \n"
1007 -  "imul   %%rax, %%r13 \n"
1008 -  "add    %%r14, %%r13 \n"
1009 -  "shr    $16, %%r13 \n"
1010 -  "movq   (%5,%%r13,8),%%xmm1\n"
1011 -
1012 -  "mov    %%r10, %%rax \n"
1013 -  "lea    (%%r10,%6),%%r11\n"
1014 -  "sar    $0x10,%%r10\n"
1015 -
1016 -  "movzb  (%0,%%r10,1), %%r13 \n"
1017 -  "movzb  1(%0,%%r10,1), %%r14 \n"
1018 -  "and    $0xffff, %%rax \n"
1019 -  "imul   %%rax, %%r14 \n"
1020 -  "xor    $0xffff, %%rax \n"
1021 -  "imul   %%rax, %%r13 \n"
1022 -  "add    %%r14, %%r13 \n"
1023 -  "shr    $16, %%r13 \n"
1024 -  "movq   (%5,%%r13,8),%%xmm2\n"
1025 -
1026 -  "paddsw %%xmm0,%%xmm1\n"
1027 -  "paddsw %%xmm0,%%xmm2\n"
1028 -  "shufps $0x44,%%xmm2,%%xmm1\n"
1029 -  "psraw  $0x6,%%xmm1\n"
1030 -  "packuswb %%xmm1,%%xmm1\n"
1031 -  "movq   %%xmm1,0x0(%3)\n"
1032 -  "add    $0x8,%3\n"
1033 -  "sub    $0x2,%4\n"
1034 -  "jns    .lscaleloop\n"
1035 -
1036 -".lscalenext:"
1037 -  "add    $0x1,%4\n"
1038 -  "js     .lscaledone\n"
1039 -
1040 -  "mov    %%r11,%%r10\n"
1041 -  "sar    $0x11,%%r10\n"
1042 -
1043 -  "movzb  (%1,%%r10,1), %%r13 \n"
1044 -  "movq   2048(%5,%%r13,8),%%xmm0\n"
1045 -
1046 -  "movzb  (%2,%%r10,1), %%r13 \n"
1047 -  "movq   4096(%5,%%r13,8),%%xmm1\n"
1048 -
1049 -  "paddsw %%xmm1,%%xmm0\n"
1050 -  "sar    $0x10,%%r11\n"
1051 -
1052 -  "movzb  (%0,%%r11,1), %%r13 \n"
1053 -  "movq   (%5,%%r13,8),%%xmm1\n"
1054 -
1055 -  "paddsw %%xmm0,%%xmm1\n"
1056 -  "psraw  $0x6,%%xmm1\n"
1057 -  "packuswb %%xmm1,%%xmm1\n"
1058 -  "movd   %%xmm1,0x0(%3)\n"
1059 -
1060 -".lscaledone:"
1061 -  :
1062 -  : "r"(y_buf),  // %0
1063 -    "r"(u_buf),  // %1
1064 -    "r"(v_buf),  // %2
1065 -    "r"(rgb_buf),  // %3
1066 -    "r"(width),  // %4
1067 -    "r" (kCoefficientsRgbY),  // %5
1068 -    "r"(static_cast<long>(source_dx))  // %6
1069 -  : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1070 -);
1071 -}
1072 -
1073 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1074 -
1075 -// PIC version is slower because less registers are available, so
1076 -// non-PIC is used on platforms where it is possible.
1077 -
1078 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1079 -                              const uint8* u_buf,
1080 -                              const uint8* v_buf,
1081 -                              uint8* rgb_buf,
1082 -                              int width);
1083 -  asm(
1084 -  ".text\n"
1085 -  ".global FastConvertYUVToRGB32Row\n"
1086 -"FastConvertYUVToRGB32Row:\n"
1087 -  "pusha\n"
1088 -  "mov    0x24(%esp),%edx\n"
1089 -  "mov    0x28(%esp),%edi\n"
1090 -  "mov    0x2c(%esp),%esi\n"
1091 -  "mov    0x30(%esp),%ebp\n"
1092 -  "mov    0x34(%esp),%ecx\n"
1093 -  "jmp    convertend\n"
1094 -
1095 -"convertloop:"
1096 -  "movzbl (%edi),%eax\n"
1097 -  "add    $0x1,%edi\n"
1098 -  "movzbl (%esi),%ebx\n"
1099 -  "add    $0x1,%esi\n"
1100 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1101 -  "movzbl (%edx),%eax\n"
1102 -  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
1103 -  "movzbl 0x1(%edx),%ebx\n"
1104 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
1105 -  "add    $0x2,%edx\n"
1106 -  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
1107 -  "paddsw %mm0,%mm1\n"
1108 -  "paddsw %mm0,%mm2\n"
1109 -  "psraw  $0x6,%mm1\n"
1110 -  "psraw  $0x6,%mm2\n"
1111 -  "packuswb %mm2,%mm1\n"
1112 -  "movntq %mm1,0x0(%ebp)\n"
1113 -  "add    $0x8,%ebp\n"
1114 -"convertend:"
1115 -  "sub    $0x2,%ecx\n"
1116 -  "jns    convertloop\n"
1117 -
1118 -  "and    $0x1,%ecx\n"
1119 -  "je     convertdone\n"
1120 -
1121 -  "movzbl (%edi),%eax\n"
1122 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1123 -  "movzbl (%esi),%eax\n"
1124 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1125 -  "movzbl (%edx),%eax\n"
1126 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
1127 -  "paddsw %mm0,%mm1\n"
1128 -  "psraw  $0x6,%mm1\n"
1129 -  "packuswb %mm1,%mm1\n"
1130 -  "movd   %mm1,0x0(%ebp)\n"
1131 -"convertdone:"
1132 -  "popa\n"
1133 -  "ret\n"
1134 -);
1135 -
1136 -
1137 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1138 -                        const uint8* u_buf,
1139 -                        const uint8* v_buf,
1140 -                        uint8* rgb_buf,
1141 -                        int width,
1142 -                        int source_dx);
1143 -  asm(
1144 -  ".text\n"
1145 -  ".global ScaleYUVToRGB32Row\n"
1146 -"ScaleYUVToRGB32Row:\n"
1147 -  "pusha\n"
1148 -  "mov    0x24(%esp),%edx\n"
1149 -  "mov    0x28(%esp),%edi\n"
1150 -  "mov    0x2c(%esp),%esi\n"
1151 -  "mov    0x30(%esp),%ebp\n"
1152 -  "mov    0x34(%esp),%ecx\n"
1153 -  "xor    %ebx,%ebx\n"
1154 -  "jmp    scaleend\n"
1155 -
1156 -"scaleloop:"
1157 -  "mov    %ebx,%eax\n"
1158 -  "sar    $0x11,%eax\n"
1159 -  "movzbl (%edi,%eax,1),%eax\n"
1160 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1161 -  "mov    %ebx,%eax\n"
1162 -  "sar    $0x11,%eax\n"
1163 -  "movzbl (%esi,%eax,1),%eax\n"
1164 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1165 -  "mov    %ebx,%eax\n"
1166 -  "add    0x38(%esp),%ebx\n"
1167 -  "sar    $0x10,%eax\n"
1168 -  "movzbl (%edx,%eax,1),%eax\n"
1169 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
1170 -  "mov    %ebx,%eax\n"
1171 -  "add    0x38(%esp),%ebx\n"
1172 -  "sar    $0x10,%eax\n"
1173 -  "movzbl (%edx,%eax,1),%eax\n"
1174 -  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
1175 -  "paddsw %mm0,%mm1\n"
1176 -  "paddsw %mm0,%mm2\n"
1177 -  "psraw  $0x6,%mm1\n"
1178 -  "psraw  $0x6,%mm2\n"
1179 -  "packuswb %mm2,%mm1\n"
1180 -  "movntq %mm1,0x0(%ebp)\n"
1181 -  "add    $0x8,%ebp\n"
1182 -"scaleend:"
1183 -  "sub    $0x2,%ecx\n"
1184 -  "jns    scaleloop\n"
1185 -
1186 -  "and    $0x1,%ecx\n"
1187 -  "je     scaledone\n"
1188 -
1189 -  "mov    %ebx,%eax\n"
1190 -  "sar    $0x11,%eax\n"
1191 -  "movzbl (%edi,%eax,1),%eax\n"
1192 -  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1193 -  "mov    %ebx,%eax\n"
1194 -  "sar    $0x11,%eax\n"
1195 -  "movzbl (%esi,%eax,1),%eax\n"
1196 -  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1197 -  "mov    %ebx,%eax\n"
1198 -  "sar    $0x10,%eax\n"
1199 -  "movzbl (%edx,%eax,1),%eax\n"
1200 -  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
1201 -  "paddsw %mm0,%mm1\n"
1202 -  "psraw  $0x6,%mm1\n"
1203 -  "packuswb %mm1,%mm1\n"
1204 -  "movd   %mm1,0x0(%ebp)\n"
1205 -
1206 -"scaledone:"
1207 -  "popa\n"
1208 -  "ret\n"
1209 -);
1210 -
1211 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1212 -                              const uint8* u_buf,
1213 -                              const uint8* v_buf,
1214 -                              uint8* rgb_buf,
1215 -                              int width,
1216 -                              int source_dx);
1217 -  asm(
1218 -  ".text\n"
1219 -  ".global LinearScaleYUVToRGB32Row\n"
1220 -"LinearScaleYUVToRGB32Row:\n"
1221 -  "pusha\n"
1222 -  "mov    0x24(%esp),%edx\n"
1223 -  "mov    0x28(%esp),%edi\n"
1224 -  "mov    0x30(%esp),%ebp\n"
1225 -
1226 -  // source_width = width * source_dx + ebx
1227 -  "mov    0x34(%esp), %ecx\n"
1228 -  "imull  0x38(%esp), %ecx\n"
1229 -  "mov    %ecx, 0x34(%esp)\n"
1230 -
1231 -  "mov    0x38(%esp), %ecx\n"
1232 -  "xor    %ebx,%ebx\n"     // x = 0
1233 -  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
1234 -  "jl     .lscaleend\n"
1235 -  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
1236 -  "jmp    .lscaleend\n"
1237 -
1238 -".lscaleloop:"
1239 -  "mov    %ebx,%eax\n"
1240 -  "sar    $0x11,%eax\n"
1241 -
1242 -  "movzbl (%edi,%eax,1),%ecx\n"
1243 -  "movzbl 1(%edi,%eax,1),%esi\n"
1244 -  "mov    %ebx,%eax\n"
1245 -  "andl   $0x1fffe, %eax \n"
1246 -  "imul   %eax, %esi \n"
1247 -  "xorl   $0x1fffe, %eax \n"
1248 -  "imul   %eax, %ecx \n"
1249 -  "addl   %esi, %ecx \n"
1250 -  "shrl   $17, %ecx \n"
1251 -  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
1252 -
1253 -  "mov    0x2c(%esp),%esi\n"
1254 -  "mov    %ebx,%eax\n"
1255 -  "sar    $0x11,%eax\n"
1256 -
1257 -  "movzbl (%esi,%eax,1),%ecx\n"
1258 -  "movzbl 1(%esi,%eax,1),%esi\n"
1259 -  "mov    %ebx,%eax\n"
1260 -  "andl   $0x1fffe, %eax \n"
1261 -  "imul   %eax, %esi \n"
1262 -  "xorl   $0x1fffe, %eax \n"
1263 -  "imul   %eax, %ecx \n"
1264 -  "addl   %esi, %ecx \n"
1265 -  "shrl   $17, %ecx \n"
1266 -  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
1267 -
1268 -  "mov    %ebx,%eax\n"
1269 -  "sar    $0x10,%eax\n"
1270 -  "movzbl (%edx,%eax,1),%ecx\n"
1271 -  "movzbl 1(%edx,%eax,1),%esi\n"
1272 -  "mov    %ebx,%eax\n"
1273 -  "add    0x38(%esp),%ebx\n"
1274 -  "andl   $0xffff, %eax \n"
1275 -  "imul   %eax, %esi \n"
1276 -  "xorl   $0xffff, %eax \n"
1277 -  "imul   %eax, %ecx \n"
1278 -  "addl   %esi, %ecx \n"
1279 -  "shrl   $16, %ecx \n"
1280 -  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
1281 -
1282 -  "cmp    0x34(%esp), %ebx\n"
1283 -  "jge    .lscalelastpixel\n"
1284 -
1285 -  "mov    %ebx,%eax\n"
1286 -  "sar    $0x10,%eax\n"
1287 -  "movzbl (%edx,%eax,1),%ecx\n"
1288 -  "movzbl 1(%edx,%eax,1),%esi\n"
1289 -  "mov    %ebx,%eax\n"
1290 -  "add    0x38(%esp),%ebx\n"
1291 -  "andl   $0xffff, %eax \n"
1292 -  "imul   %eax, %esi \n"
1293 -  "xorl   $0xffff, %eax \n"
1294 -  "imul   %eax, %ecx \n"
1295 -  "addl   %esi, %ecx \n"
1296 -  "shrl   $16, %ecx \n"
1297 -  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
1298 -
1299 -  "paddsw %mm0,%mm1\n"
1300 -  "paddsw %mm0,%mm2\n"
1301 -  "psraw  $0x6,%mm1\n"
1302 -  "psraw  $0x6,%mm2\n"
1303 -  "packuswb %mm2,%mm1\n"
1304 -  "movntq %mm1,0x0(%ebp)\n"
1305 -  "add    $0x8,%ebp\n"
1306 -
1307 -".lscaleend:"
1308 -  "cmp    0x34(%esp), %ebx\n"
1309 -  "jl     .lscaleloop\n"
1310 -  "popa\n"
1311 -  "ret\n"
1312 -
1313 -".lscalelastpixel:"
1314 -  "paddsw %mm0, %mm1\n"
1315 -  "psraw $6, %mm1\n"
1316 -  "packuswb %mm1, %mm1\n"
1317 -  "movd %mm1, (%ebp)\n"
1318 -  "popa\n"
1319 -  "ret\n"
1320 -);
1321 -
1322 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
1323 -
1324 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
1325 -                                    const uint8* u_buf,
1326 -                                    const uint8* v_buf,
1327 -                                    uint8* rgb_buf,
1328 -                                    int width,
1329 -                                    int16 *kCoefficientsRgbY);
1330 -  asm(
1331 -  ".text\n"
1332 -#if defined(OS_MACOSX)
1333 -"_PICConvertYUVToRGB32Row:\n"
1334 -#else
1335 -"PICConvertYUVToRGB32Row:\n"
1336 -#endif
1337 -  "pusha\n"
1338 -  "mov    0x24(%esp),%edx\n"
1339 -  "mov    0x28(%esp),%edi\n"
1340 -  "mov    0x2c(%esp),%esi\n"
1341 -  "mov    0x30(%esp),%ebp\n"
1342 -  "mov    0x38(%esp),%ecx\n"
1343 -
1344 -  "jmp    .Lconvertend\n"
1345 -
1346 -".Lconvertloop:"
1347 -  "movzbl (%edi),%eax\n"
1348 -  "add    $0x1,%edi\n"
1349 -  "movzbl (%esi),%ebx\n"
1350 -  "add    $0x1,%esi\n"
1351 -  "movq   2048(%ecx,%eax,8),%mm0\n"
1352 -  "movzbl (%edx),%eax\n"
1353 -  "paddsw 4096(%ecx,%ebx,8),%mm0\n"
1354 -  "movzbl 0x1(%edx),%ebx\n"
1355 -  "movq   0(%ecx,%eax,8),%mm1\n"
1356 -  "add    $0x2,%edx\n"
1357 -  "movq   0(%ecx,%ebx,8),%mm2\n"
1358 -  "paddsw %mm0,%mm1\n"
1359 -  "paddsw %mm0,%mm2\n"
1360 -  "psraw  $0x6,%mm1\n"
1361 -  "psraw  $0x6,%mm2\n"
1362 -  "packuswb %mm2,%mm1\n"
1363 -  "movntq %mm1,0x0(%ebp)\n"
1364 -  "add    $0x8,%ebp\n"
1365 -".Lconvertend:"
1366 -  "subl   $0x2,0x34(%esp)\n"
1367 -  "jns    .Lconvertloop\n"
1368 -
1369 -  "andl   $0x1,0x34(%esp)\n"
1370 -  "je     .Lconvertdone\n"
1371 -
1372 -  "movzbl (%edi),%eax\n"
1373 -  "movq   2048(%ecx,%eax,8),%mm0\n"
1374 -  "movzbl (%esi),%eax\n"
1375 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
1376 -  "movzbl (%edx),%eax\n"
1377 -  "movq   0(%ecx,%eax,8),%mm1\n"
1378 -  "paddsw %mm0,%mm1\n"
1379 -  "psraw  $0x6,%mm1\n"
1380 -  "packuswb %mm1,%mm1\n"
1381 -  "movd   %mm1,0x0(%ebp)\n"
1382 -".Lconvertdone:\n"
1383 -  "popa\n"
1384 -  "ret\n"
1385 -);
1386 -
1387 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1388 -                              const uint8* u_buf,
1389 -                              const uint8* v_buf,
1390 -                              uint8* rgb_buf,
1391 -                              int width) {
1392 -  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
1393 -                          &kCoefficientsRgbY[0][0]);
1394 -}
1395 -
1396 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
1397 -                               const uint8* u_buf,
1398 -                               const uint8* v_buf,
1399 -                               uint8* rgb_buf,
1400 -                               int width,
1401 -                               int source_dx,
1402 -                               int16 *kCoefficientsRgbY);
1403 -
1404 -  asm(
1405 -  ".text\n"
1406 -#if defined(OS_MACOSX)
1407 -"_PICScaleYUVToRGB32Row:\n"
1408 -#else
1409 -"PICScaleYUVToRGB32Row:\n"
1410 -#endif
1411 -  "pusha\n"
1412 -  "mov    0x24(%esp),%edx\n"
1413 -  "mov    0x28(%esp),%edi\n"
1414 -  "mov    0x2c(%esp),%esi\n"
1415 -  "mov    0x30(%esp),%ebp\n"
1416 -  "mov    0x3c(%esp),%ecx\n"
1417 -  "xor    %ebx,%ebx\n"
1418 -  "jmp    Lscaleend\n"
1419 -
1420 -"Lscaleloop:"
1421 -  "mov    %ebx,%eax\n"
1422 -  "sar    $0x11,%eax\n"
1423 -  "movzbl (%edi,%eax,1),%eax\n"
1424 -  "movq   2048(%ecx,%eax,8),%mm0\n"
1425 -  "mov    %ebx,%eax\n"
1426 -  "sar    $0x11,%eax\n"
1427 -  "movzbl (%esi,%eax,1),%eax\n"
1428 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
1429 -  "mov    %ebx,%eax\n"
1430 -  "add    0x38(%esp),%ebx\n"
1431 -  "sar    $0x10,%eax\n"
1432 -  "movzbl (%edx,%eax,1),%eax\n"
1433 -  "movq   0(%ecx,%eax,8),%mm1\n"
1434 -  "mov    %ebx,%eax\n"
1435 -  "add    0x38(%esp),%ebx\n"
1436 -  "sar    $0x10,%eax\n"
1437 -  "movzbl (%edx,%eax,1),%eax\n"
1438 -  "movq   0(%ecx,%eax,8),%mm2\n"
1439 -  "paddsw %mm0,%mm1\n"
1440 -  "paddsw %mm0,%mm2\n"
1441 -  "psraw  $0x6,%mm1\n"
1442 -  "psraw  $0x6,%mm2\n"
1443 -  "packuswb %mm2,%mm1\n"
1444 -  "movntq %mm1,0x0(%ebp)\n"
1445 -  "add    $0x8,%ebp\n"
1446 -"Lscaleend:"
1447 -  "subl   $0x2,0x34(%esp)\n"
1448 -  "jns    Lscaleloop\n"
1449 -
1450 -  "andl   $0x1,0x34(%esp)\n"
1451 -  "je     Lscaledone\n"
1452 -
1453 -  "mov    %ebx,%eax\n"
1454 -  "sar    $0x11,%eax\n"
1455 -  "movzbl (%edi,%eax,1),%eax\n"
1456 -  "movq   2048(%ecx,%eax,8),%mm0\n"
1457 -  "mov    %ebx,%eax\n"
1458 -  "sar    $0x11,%eax\n"
1459 -  "movzbl (%esi,%eax,1),%eax\n"
1460 -  "paddsw 4096(%ecx,%eax,8),%mm0\n"
1461 -  "mov    %ebx,%eax\n"
1462 -  "sar    $0x10,%eax\n"
1463 -  "movzbl (%edx,%eax,1),%eax\n"
1464 -  "movq   0(%ecx,%eax,8),%mm1\n"
1465 -  "paddsw %mm0,%mm1\n"
1466 -  "psraw  $0x6,%mm1\n"
1467 -  "packuswb %mm1,%mm1\n"
1468 -  "movd   %mm1,0x0(%ebp)\n"
1469 -
1470 -"Lscaledone:"
1471 -  "popa\n"
1472 -  "ret\n"
1473 -);
1474 -
1475 -
1476 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1477 -                        const uint8* u_buf,
1478 -                        const uint8* v_buf,
1479 -                        uint8* rgb_buf,
1480 -                        int width,
1481 -                        int source_dx) {
1482 -  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1483 -                        &kCoefficientsRgbY[0][0]);
1484 -}
1485 -
1486 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
1487 -                                 const uint8* u_buf,
1488 -                                 const uint8* v_buf,
1489 -                                 uint8* rgb_buf,
1490 -                                 int width,
1491 -                                 int source_dx,
1492 -                                 int16 *kCoefficientsRgbY);
1493 -  asm(
1494 -  ".text\n"
1495 -#if defined(OS_MACOSX)
1496 -"_PICLinearScaleYUVToRGB32Row:\n"
1497 -#else
1498 -"PICLinearScaleYUVToRGB32Row:\n"
1499 -#endif
1500 -  "pusha\n"
1501 -  "mov    0x24(%esp),%edx\n"
1502 -  "mov    0x30(%esp),%ebp\n"
1503 -  "mov    0x34(%esp),%ecx\n"
1504 -  "mov    0x3c(%esp),%edi\n"
1505 -  "xor    %ebx,%ebx\n"
1506 -
1507 -  // source_width = width * source_dx + ebx
1508 -  "mov    0x34(%esp), %ecx\n"
1509 -  "imull  0x38(%esp), %ecx\n"
1510 -  "mov    %ecx, 0x34(%esp)\n"
1511 -
1512 -  "mov    0x38(%esp), %ecx\n"
1513 -  "xor    %ebx,%ebx\n"     // x = 0
1514 -  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
1515 -  "jl     .lscaleend\n"
1516 -  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
1517 -  "jmp    .lscaleend\n"
1518 -
1519 -".lscaleloop:"
1520 -  "mov    0x28(%esp),%esi\n"
1521 -  "mov    %ebx,%eax\n"
1522 -  "sar    $0x11,%eax\n"
1523 -
1524 -  "movzbl (%esi,%eax,1),%ecx\n"
1525 -  "movzbl 1(%esi,%eax,1),%esi\n"
1526 -  "mov    %ebx,%eax\n"
1527 -  "andl   $0x1fffe, %eax \n"
1528 -  "imul   %eax, %esi \n"
1529 -  "xorl   $0x1fffe, %eax \n"
1530 -  "imul   %eax, %ecx \n"
1531 -  "addl   %esi, %ecx \n"
1532 -  "shrl   $17, %ecx \n"
1533 -  "movq   2048(%edi,%ecx,8),%mm0\n"
1534 -
1535 -  "mov    0x2c(%esp),%esi\n"
1536 -  "mov    %ebx,%eax\n"
1537 -  "sar    $0x11,%eax\n"
1538 -
1539 -  "movzbl (%esi,%eax,1),%ecx\n"
1540 -  "movzbl 1(%esi,%eax,1),%esi\n"
1541 -  "mov    %ebx,%eax\n"
1542 -  "andl   $0x1fffe, %eax \n"
1543 -  "imul   %eax, %esi \n"
1544 -  "xorl   $0x1fffe, %eax \n"
1545 -  "imul   %eax, %ecx \n"
1546 -  "addl   %esi, %ecx \n"
1547 -  "shrl   $17, %ecx \n"
1548 -  "paddsw 4096(%edi,%ecx,8),%mm0\n"
1549 -
1550 -  "mov    %ebx,%eax\n"
1551 -  "sar    $0x10,%eax\n"
1552 -  "movzbl (%edx,%eax,1),%ecx\n"
1553 -  "movzbl 1(%edx,%eax,1),%esi\n"
1554 -  "mov    %ebx,%eax\n"
1555 -  "add    0x38(%esp),%ebx\n"
1556 -  "andl   $0xffff, %eax \n"
1557 -  "imul   %eax, %esi \n"
1558 -  "xorl   $0xffff, %eax \n"
1559 -  "imul   %eax, %ecx \n"
1560 -  "addl   %esi, %ecx \n"
1561 -  "shrl   $16, %ecx \n"
1562 -  "movq   (%edi,%ecx,8),%mm1\n"
1563 -
1564 -  "cmp    0x34(%esp), %ebx\n"
1565 -  "jge    .lscalelastpixel\n"
1566 -
1567 -  "mov    %ebx,%eax\n"
1568 -  "sar    $0x10,%eax\n"
1569 -  "movzbl (%edx,%eax,1),%ecx\n"
1570 -  "movzbl 1(%edx,%eax,1),%esi\n"
1571 -  "mov    %ebx,%eax\n"
1572 -  "add    0x38(%esp),%ebx\n"
1573 -  "andl   $0xffff, %eax \n"
1574 -  "imul   %eax, %esi \n"
1575 -  "xorl   $0xffff, %eax \n"
1576 -  "imul   %eax, %ecx \n"
1577 -  "addl   %esi, %ecx \n"
1578 -  "shrl   $16, %ecx \n"
1579 -  "movq   (%edi,%ecx,8),%mm2\n"
1580 -
1581 -  "paddsw %mm0,%mm1\n"
1582 -  "paddsw %mm0,%mm2\n"
1583 -  "psraw  $0x6,%mm1\n"
1584 -  "psraw  $0x6,%mm2\n"
1585 -  "packuswb %mm2,%mm1\n"
1586 -  "movntq %mm1,0x0(%ebp)\n"
1587 -  "add    $0x8,%ebp\n"
1588 -
1589 -".lscaleend:"
1590 -  "cmp    %ebx, 0x34(%esp)\n"
1591 -  "jg     .lscaleloop\n"
1592 -  "popa\n"
1593 -  "ret\n"
1594 -
1595 -".lscalelastpixel:"
1596 -  "paddsw %mm0, %mm1\n"
1597 -  "psraw $6, %mm1\n"
1598 -  "packuswb %mm1, %mm1\n"
1599 -  "movd %mm1, (%ebp)\n"
1600 -  "popa\n"
1601 -  "ret\n"
1602 -);
1603 -
1604 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1605 -                        const uint8* u_buf,
1606 -                        const uint8* v_buf,
1607 -                        uint8* rgb_buf,
1608 -                        int width,
1609 -                        int source_dx) {
1610 -  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1611 -                              &kCoefficientsRgbY[0][0]);
1612 -}
1613 -
1614 -#else  // USE_MMX
1615 -
1616  // C reference code that mimic the YUV assembly.
1617  #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
1618  #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
1619      (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
1620
1621  static inline void YuvPixel(uint8 y,
1622                              uint8 u,
1623                              uint8 v,
1624 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
1625    a >>= 6;
1626
1627    *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
1628                                          (packuswb(g) << 8) |
1629                                          (packuswb(r) << 16) |
1630                                          (packuswb(a) << 24);
1631  }
1632
1633 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1634 -                              const uint8* u_buf,
1635 -                              const uint8* v_buf,
1636 -                              uint8* rgb_buf,
1637 -                              int width) {
1638 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
1639 +                                const uint8* u_buf,
1640 +                                const uint8* v_buf,
1641 +                                uint8* rgb_buf,
1642 +                                int width,
1643 +                                unsigned int x_shift) {
1644    for (int x = 0; x < width; x += 2) {
1645 -    uint8 u = u_buf[x >> 1];
1646 -    uint8 v = v_buf[x >> 1];
1647 +    uint8 u = u_buf[x >> x_shift];
1648 +    uint8 v = v_buf[x >> x_shift];
1649      uint8 y0 = y_buf[x];
1650      YuvPixel(y0, u, v, rgb_buf);
1651      if ((x + 1) < width) {
1652        uint8 y1 = y_buf[x + 1];
1653 +      if (x_shift == 0) {
1654 +        u = u_buf[x + 1];
1655 +        v = v_buf[x + 1];
1656 +      }
1657        YuvPixel(y1, u, v, rgb_buf + 4);
1658      }
1659      rgb_buf += 8;  // Advance 2 pixels.
1660    }
1661  }
1662
1663  // 16.16 fixed point is used.  A shift by 16 isolates the integer.
1664  // A shift by 17 is used to further subsample the chrominence channels.
1665  // & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
1666  // for 1/65536 pixel accurate interpolation.
1667 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1668 -                        const uint8* u_buf,
1669 -                        const uint8* v_buf,
1670 -                        uint8* rgb_buf,
1671 -                        int width,
1672 -                        int source_dx) {
1673 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
1674 +                          const uint8* u_buf,
1675 +                          const uint8* v_buf,
1676 +                          uint8* rgb_buf,
1677 +                          int width,
1678 +                          int source_dx) {
1679    int x = 0;
1680    for (int i = 0; i < width; i += 2) {
1681      int y = y_buf[x >> 16];
1682      int u = u_buf[(x >> 17)];
1683      int v = v_buf[(x >> 17)];
1684      YuvPixel(y, u, v, rgb_buf);
1685      x += source_dx;
1686      if ((i + 1) < width) {
1687        y = y_buf[x >> 16];
1688        YuvPixel(y, u, v, rgb_buf+4);
1689        x += source_dx;
1690      }
1691      rgb_buf += 8;
1692    }
1693  }
1694
1695 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1696 -                              const uint8* u_buf,
1697 -                              const uint8* v_buf,
1698 -                              uint8* rgb_buf,
1699 -                              int width,
1700 -                              int source_dx) {
1701 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
1702 +                                const uint8* u_buf,
1703 +                                const uint8* v_buf,
1704 +                                uint8* rgb_buf,
1705 +                                int width,
1706 +                                int source_dx) {
1707    int x = 0;
1708    if (source_dx >= 0x20000) {
1709      x = 32768;
1710    }
1711    for (int i = 0; i < width; i += 2) {
1712      int y0 = y_buf[x >> 16];
1713      int y1 = y_buf[(x >> 16) + 1];
1714      int u0 = u_buf[(x >> 17)];
1715 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
1716        y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
1717        YuvPixel(y, u, v, rgb_buf+4);
1718        x += source_dx;
1719      }
1720      rgb_buf += 8;
1721    }
1722  }
1723
1724 -#endif  // USE_MMX
1725  }  // extern "C"
1726
1727 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
1728 --- a/gfx/ycbcr/yuv_row_posix.cpp
1729 +++ b/gfx/ycbcr/yuv_row_posix.cpp
1730 @@ -1,33 +1,32 @@
1731  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
1732  // Use of this source code is governed by a BSD-style license that can be
1733  // found in the LICENSE file.
1734
1735 -#include "media/base/yuv_row.h"
1736 -
1737 -#ifdef _DEBUG
1738 -#include "base/logging.h"
1739 -#else
1740 +#include "yuv_row.h"
1741 +#include "mozilla/SSE.h"
1742 +
1743  #define DCHECK(a)
1744 -#endif
1745
1746  extern "C" {
1747
1748 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
1749 +#if defined(ARCH_CPU_X86_64)
1750 +
1751 +// We don't need CPUID guards here, since x86-64 implies SSE2.
1752
1753  // AMD64 ABI uses register paremters.
1754  void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
1755                                const uint8* u_buf,  // rsi
1756                                const uint8* v_buf,  // rdx
1757                                uint8* rgb_buf,      // rcx
1758                                int width) {         // r8
1759    asm(
1760 -  "jmp    convertend\n"
1761 -"convertloop:"
1762 +  "jmp    1f\n"
1763 +"0:"
1764    "movzb  (%1),%%r10\n"
1765    "add    $0x1,%1\n"
1766    "movzb  (%2),%%r11\n"
1767    "add    $0x1,%2\n"
1768    "movq   2048(%5,%%r10,8),%%xmm0\n"
1769    "movzb  (%0),%%r10\n"
1770    "movq   4096(%5,%%r11,8),%%xmm1\n"
1771    "movzb  0x1(%0),%%r11\n"
1772 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
1773    "movq   (%5,%%r11,8),%%xmm3\n"
1774    "paddsw %%xmm0,%%xmm2\n"
1775    "paddsw %%xmm0,%%xmm3\n"
1776    "shufps $0x44,%%xmm3,%%xmm2\n"
1777    "psraw  $0x6,%%xmm2\n"
1778    "packuswb %%xmm2,%%xmm2\n"
1779    "movq   %%xmm2,0x0(%3)\n"
1780    "add    $0x8,%3\n"
1781 -"convertend:"
1782 +"1:"
1783    "sub    $0x2,%4\n"
1784 -  "jns    convertloop\n"
1785 -
1786 -"convertnext:"
1787 +  "jns    0b\n"
1788 +
1789 +"2:"
1790    "add    $0x1,%4\n"
1791 -  "js     convertdone\n"
1792 +  "js     3f\n"
1793
1794    "movzb  (%1),%%r10\n"
1795    "movq   2048(%5,%%r10,8),%%xmm0\n"
1796    "movzb  (%2),%%r10\n"
1797    "movq   4096(%5,%%r10,8),%%xmm1\n"
1798    "paddsw %%xmm1,%%xmm0\n"
1799    "movzb  (%0),%%r10\n"
1800    "movq   (%5,%%r10,8),%%xmm1\n"
1801    "paddsw %%xmm0,%%xmm1\n"
1802    "psraw  $0x6,%%xmm1\n"
1803    "packuswb %%xmm1,%%xmm1\n"
1804    "movd   %%xmm1,0x0(%3)\n"
1805 -"convertdone:"
1806 +"3:"
1807    :
1808    : "r"(y_buf),  // %0
1809      "r"(u_buf),  // %1
1810      "r"(v_buf),  // %2
1811      "r"(rgb_buf),  // %3
1812      "r"(width),  // %4
1813      "r" (kCoefficientsRgbY)  // %5
1814    : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
1815 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1816                          const uint8* u_buf,  // rsi
1817                          const uint8* v_buf,  // rdx
1818                          uint8* rgb_buf,      // rcx
1819                          int width,           // r8
1820                          int source_dx) {     // r9
1821    asm(
1822    "xor    %%r11,%%r11\n"
1823    "sub    $0x2,%4\n"
1824 -  "js     scalenext\n"
1825 -
1826 -"scaleloop:"
1827 +  "js     1f\n"
1828 +
1829 +"0:"
1830    "mov    %%r11,%%r10\n"
1831    "sar    $0x11,%%r10\n"
1832    "movzb  (%1,%%r10,1),%%rax\n"
1833    "movq   2048(%5,%%rax,8),%%xmm0\n"
1834    "movzb  (%2,%%r10,1),%%rax\n"
1835    "movq   4096(%5,%%rax,8),%%xmm1\n"
1836    "lea    (%%r11,%6),%%r10\n"
1837    "sar    $0x10,%%r11\n"
1838 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1839    "paddsw %%xmm0,%%xmm1\n"
1840    "paddsw %%xmm0,%%xmm2\n"
1841    "shufps $0x44,%%xmm2,%%xmm1\n"
1842    "psraw  $0x6,%%xmm1\n"
1843    "packuswb %%xmm1,%%xmm1\n"
1844    "movq   %%xmm1,0x0(%3)\n"
1845    "add    $0x8,%3\n"
1846    "sub    $0x2,%4\n"
1847 -  "jns    scaleloop\n"
1848 -
1849 -"scalenext:"
1850 +  "jns    0b\n"
1851 +
1852 +"1:"
1853    "add    $0x1,%4\n"
1854 -  "js     scaledone\n"
1855 +  "js     2f\n"
1856
1857    "mov    %%r11,%%r10\n"
1858    "sar    $0x11,%%r10\n"
1859    "movzb  (%1,%%r10,1),%%rax\n"
1860    "movq   2048(%5,%%rax,8),%%xmm0\n"
1861    "movzb  (%2,%%r10,1),%%rax\n"
1862    "movq   4096(%5,%%rax,8),%%xmm1\n"
1863    "paddsw %%xmm1,%%xmm0\n"
1864    "sar    $0x10,%%r11\n"
1865    "movzb  (%0,%%r11,1),%%rax\n"
1866    "movq   (%5,%%rax,8),%%xmm1\n"
1867    "paddsw %%xmm0,%%xmm1\n"
1868    "psraw  $0x6,%%xmm1\n"
1869    "packuswb %%xmm1,%%xmm1\n"
1870    "movd   %%xmm1,0x0(%3)\n"
1871
1872 -"scaledone:"
1873 +"2:"
1874    :
1875    : "r"(y_buf),  // %0
1876      "r"(u_buf),  // %1
1877      "r"(v_buf),  // %2
1878      "r"(rgb_buf),  // %3
1879      "r"(width),  // %4
1880      "r" (kCoefficientsRgbY),  // %5
1881      "r"(static_cast<long>(source_dx))  // %6
1882 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
1883                                const uint8* u_buf,
1884                                const uint8* v_buf,
1885                                uint8* rgb_buf,
1886                                int width,
1887                                int source_dx) {
1888    asm(
1889    "xor    %%r11,%%r11\n"   // x = 0
1890    "sub    $0x2,%4\n"
1891 -  "js     .lscalenext\n"
1892 +  "js     2f\n"
1893    "cmp    $0x20000,%6\n"   // if source_dx >= 2.0
1894 -  "jl     .lscalehalf\n"
1895 +  "jl     0f\n"
1896    "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1897 -".lscalehalf:"
1898 -
1899 -".lscaleloop:"
1900 +"0:"
1901 +
1902 +"1:"
1903    "mov    %%r11,%%r10\n"
1904    "sar    $0x11,%%r10\n"
1905
1906    "movzb  (%1, %%r10, 1), %%r13 \n"
1907    "movzb  1(%1, %%r10, 1), %%r14 \n"
1908    "mov    %%r11, %%rax \n"
1909    "and    $0x1fffe, %%rax \n"
1910    "imul   %%rax, %%r14 \n"
1911 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
1912    "paddsw %%xmm0,%%xmm1\n"
1913    "paddsw %%xmm0,%%xmm2\n"
1914    "shufps $0x44,%%xmm2,%%xmm1\n"
1915    "psraw  $0x6,%%xmm1\n"
1916    "packuswb %%xmm1,%%xmm1\n"
1917    "movq   %%xmm1,0x0(%3)\n"
1918    "add    $0x8,%3\n"
1919    "sub    $0x2,%4\n"
1920 -  "jns    .lscaleloop\n"
1921 -
1922 -".lscalenext:"
1923 +  "jns    1b\n"
1924 +
1925 +"2:"
1926    "add    $0x1,%4\n"
1927 -  "js     .lscaledone\n"
1928 +  "js     3f\n"
1929
1930    "mov    %%r11,%%r10\n"
1931    "sar    $0x11,%%r10\n"
1932
1933    "movzb  (%1,%%r10,1), %%r13 \n"
1934    "movq   2048(%5,%%r13,8),%%xmm0\n"
1935
1936    "movzb  (%2,%%r10,1), %%r13 \n"
1937 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
1938    "movzb  (%0,%%r11,1), %%r13 \n"
1939    "movq   (%5,%%r13,8),%%xmm1\n"
1940
1941    "paddsw %%xmm0,%%xmm1\n"
1942    "psraw  $0x6,%%xmm1\n"
1943    "packuswb %%xmm1,%%xmm1\n"
1944    "movd   %%xmm1,0x0(%3)\n"
1945
1946 -".lscaledone:"
1947 +"3:"
1948    :
1949    : "r"(y_buf),  // %0
1950      "r"(u_buf),  // %1
1951      "r"(v_buf),  // %2
1952      "r"(rgb_buf),  // %3
1953      "r"(width),  // %4
1954      "r" (kCoefficientsRgbY),  // %5
1955      "r"(static_cast<long>(source_dx))  // %6
1956    : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1957  );
1958  }
1959
1960 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1961 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
1962
1963  // PIC version is slower because less registers are available, so
1964  // non-PIC is used on platforms where it is possible.
1965 -
1966 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1967 -                              const uint8* u_buf,
1968 -                              const uint8* v_buf,
1969 -                              uint8* rgb_buf,
1970 -                              int width);
1971 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
1972 +                                  const uint8* u_buf,
1973 +                                  const uint8* v_buf,
1974 +                                  uint8* rgb_buf,
1975 +                                  int width);
1976    asm(
1977    ".text\n"
1978 -  ".global FastConvertYUVToRGB32Row\n"
1979 -"FastConvertYUVToRGB32Row:\n"
1980 +  ".global FastConvertYUVToRGB32Row_SSE\n"
1981 +  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
1982 +"FastConvertYUVToRGB32Row_SSE:\n"
1983    "pusha\n"
1984    "mov    0x24(%esp),%edx\n"
1985    "mov    0x28(%esp),%edi\n"
1986    "mov    0x2c(%esp),%esi\n"
1987    "mov    0x30(%esp),%ebp\n"
1988    "mov    0x34(%esp),%ecx\n"
1989 -  "jmp    convertend\n"
1990 -
1991 -"convertloop:"
1992 +  "jmp    1f\n"
1993 +
1994 +"0:"
1995    "movzbl (%edi),%eax\n"
1996    "add    $0x1,%edi\n"
1997    "movzbl (%esi),%ebx\n"
1998    "add    $0x1,%esi\n"
1999    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2000    "movzbl (%edx),%eax\n"
2001    "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
2002    "movzbl 0x1(%edx),%ebx\n"
2003 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
2004    "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
2005    "paddsw %mm0,%mm1\n"
2006    "paddsw %mm0,%mm2\n"
2007    "psraw  $0x6,%mm1\n"
2008    "psraw  $0x6,%mm2\n"
2009    "packuswb %mm2,%mm1\n"
2010    "movntq %mm1,0x0(%ebp)\n"
2011    "add    $0x8,%ebp\n"
2012 -"convertend:"
2013 +"1:"
2014    "sub    $0x2,%ecx\n"
2015 -  "jns    convertloop\n"
2016 +  "jns    0b\n"
2017
2018    "and    $0x1,%ecx\n"
2019 -  "je     convertdone\n"
2020 +  "je     2f\n"
2021
2022    "movzbl (%edi),%eax\n"
2023    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2024    "movzbl (%esi),%eax\n"
2025    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2026    "movzbl (%edx),%eax\n"
2027    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
2028    "paddsw %mm0,%mm1\n"
2029    "psraw  $0x6,%mm1\n"
2030    "packuswb %mm1,%mm1\n"
2031    "movd   %mm1,0x0(%ebp)\n"
2032 -"convertdone:"
2033 +"2:"
2034    "popa\n"
2035    "ret\n"
2036 +#if !defined(XP_MACOSX)
2037 +  ".previous\n"
2038 +#endif
2039  );
2040
2041 -
2042 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2043 -                        const uint8* u_buf,
2044 -                        const uint8* v_buf,
2045 -                        uint8* rgb_buf,
2046 -                        int width,
2047 -                        int source_dx);
2048 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
2049 +                              const uint8* u_buf,
2050 +                              const uint8* v_buf,
2051 +                              uint8* rgb_buf,
2052 +                              int width)
2053 +{
2054 +  // Pick the row converter at run time: C unless the CPU reports SSE.
2055 +  if (!mozilla::supports_sse()) {
2056 +    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2057 +    return;
2058 +  }
2059 +  FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
2060 +}
2061 +
2062 +
2063 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2064 +                            const uint8* u_buf,
2065 +                            const uint8* v_buf,
2066 +                            uint8* rgb_buf,
2067 +                            int width,
2068 +                            int source_dx);
2069    asm(
2070    ".text\n"
2071 -  ".global ScaleYUVToRGB32Row\n"
2072 -"ScaleYUVToRGB32Row:\n"
2073 +  ".global ScaleYUVToRGB32Row_SSE\n"
2074 +  ".type ScaleYUVToRGB32Row_SSE, @function\n"
2075 +"ScaleYUVToRGB32Row_SSE:\n"
2076    "pusha\n"
2077    "mov    0x24(%esp),%edx\n"
2078    "mov    0x28(%esp),%edi\n"
2079    "mov    0x2c(%esp),%esi\n"
2080    "mov    0x30(%esp),%ebp\n"
2081    "mov    0x34(%esp),%ecx\n"
2082    "xor    %ebx,%ebx\n"
2083 -  "jmp    scaleend\n"
2084 -
2085 -"scaleloop:"
2086 +  "jmp    1f\n"
2087 +
2088 +"0:"
2089    "mov    %ebx,%eax\n"
2090    "sar    $0x11,%eax\n"
2091    "movzbl (%edi,%eax,1),%eax\n"
2092    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2093    "mov    %ebx,%eax\n"
2094    "sar    $0x11,%eax\n"
2095    "movzbl (%esi,%eax,1),%eax\n"
2096    "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2097 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2098    "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
2099    "paddsw %mm0,%mm1\n"
2100    "paddsw %mm0,%mm2\n"
2101    "psraw  $0x6,%mm1\n"
2102    "psraw  $0x6,%mm2\n"
2103    "packuswb %mm2,%mm1\n"
2104    "movntq %mm1,0x0(%ebp)\n"
2105    "add    $0x8,%ebp\n"
2106 -"scaleend:"
2107 +"1:"
2108    "sub    $0x2,%ecx\n"
2109 -  "jns    scaleloop\n"
2110 +  "jns    0b\n"
2111
2112    "and    $0x1,%ecx\n"
2113 -  "je     scaledone\n"
2114 +  "je     2f\n"
2115
2116    "mov    %ebx,%eax\n"
2117    "sar    $0x11,%eax\n"
2118    "movzbl (%edi,%eax,1),%eax\n"
2119    "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2120    "mov    %ebx,%eax\n"
2121    "sar    $0x11,%eax\n"
2122    "movzbl (%esi,%eax,1),%eax\n"
2123 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2124    "sar    $0x10,%eax\n"
2125    "movzbl (%edx,%eax,1),%eax\n"
2126    "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
2127    "paddsw %mm0,%mm1\n"
2128    "psraw  $0x6,%mm1\n"
2129    "packuswb %mm1,%mm1\n"
2130    "movd   %mm1,0x0(%ebp)\n"
2131
2132 -"scaledone:"
2133 +"2:"
2134    "popa\n"
2135    "ret\n"
2136 +#if !defined(XP_MACOSX)
2137 +  ".previous\n"
2138 +#endif
2139  );
2140
2141 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2142 -                              const uint8* u_buf,
2143 -                              const uint8* v_buf,
2144 -                              uint8* rgb_buf,
2145 -                              int width,
2146 -                              int source_dx);
2147 +void ScaleYUVToRGB32Row(const uint8* y_buf,
2148 +                        const uint8* u_buf,
2149 +                        const uint8* v_buf,
2150 +                        uint8* rgb_buf,
2151 +                        int width,
2152 +                        int source_dx)
2153 +{
2154 +  // Run-time dispatch: use the assembly row scaler when the CPU reports SSE.
2155 +  if (mozilla::supports_sse()) {
2156 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2157 +                           width, source_dx);
2158 +    return;  // Row is already written; do not redo it with the C routine.
2159 +  }
2160 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2161 +}
2162 +
2163 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2164 +                                  const uint8* u_buf,
2165 +                                  const uint8* v_buf,
2166 +                                  uint8* rgb_buf,
2167 +                                  int width,
2168 +                                  int source_dx);
2169    asm(
2170    ".text\n"
2171 -  ".global LinearScaleYUVToRGB32Row\n"
2172 -"LinearScaleYUVToRGB32Row:\n"
2173 +  ".global LinearScaleYUVToRGB32Row_SSE\n"
2174 +  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
2175 +"LinearScaleYUVToRGB32Row_SSE:\n"
2176    "pusha\n"
2177    "mov    0x24(%esp),%edx\n"
2178    "mov    0x28(%esp),%edi\n"
2179    "mov    0x30(%esp),%ebp\n"
2180
2181    // source_width = width * source_dx + ebx
2182    "mov    0x34(%esp), %ecx\n"
2183    "imull  0x38(%esp), %ecx\n"
2184    "mov    %ecx, 0x34(%esp)\n"
2185
2186    "mov    0x38(%esp), %ecx\n"
2187    "xor    %ebx,%ebx\n"     // x = 0
2188    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
2189 -  "jl     .lscaleend\n"
2190 +  "jl     1f\n"
2191    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
2192 -  "jmp    .lscaleend\n"
2193 -
2194 -".lscaleloop:"
2195 -  "mov    %ebx,%eax\n"
2196 -  "sar    $0x11,%eax\n"
2197 +  "jmp    1f\n"
2198 +
2199 +"0:"
2200 +  "mov    %ebx,%eax\n"
2201 +  "sar    $0x11,%eax\n"
2202
2203    "movzbl (%edi,%eax,1),%ecx\n"
2204    "movzbl 1(%edi,%eax,1),%esi\n"
2205    "mov    %ebx,%eax\n"
2206    "andl   $0x1fffe, %eax \n"
2207    "imul   %eax, %esi \n"
2208    "xorl   $0x1fffe, %eax \n"
2209    "imul   %eax, %ecx \n"
2210 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
2211    "imul   %eax, %esi \n"
2212    "xorl   $0xffff, %eax \n"
2213    "imul   %eax, %ecx \n"
2214    "addl   %esi, %ecx \n"
2215    "shrl   $16, %ecx \n"
2216    "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
2217
2218    "cmp    0x34(%esp), %ebx\n"
2219 -  "jge    .lscalelastpixel\n"
2220 +  "jge    2f\n"
2221
2222    "mov    %ebx,%eax\n"
2223    "sar    $0x10,%eax\n"
2224    "movzbl (%edx,%eax,1),%ecx\n"
2225    "movzbl 1(%edx,%eax,1),%esi\n"
2226    "mov    %ebx,%eax\n"
2227    "add    0x38(%esp),%ebx\n"
2228    "andl   $0xffff, %eax \n"
2229 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
2230    "paddsw %mm0,%mm1\n"
2231    "paddsw %mm0,%mm2\n"
2232    "psraw  $0x6,%mm1\n"
2233    "psraw  $0x6,%mm2\n"
2234    "packuswb %mm2,%mm1\n"
2235    "movntq %mm1,0x0(%ebp)\n"
2236    "add    $0x8,%ebp\n"
2237
2238 -".lscaleend:"
2239 +"1:"
2240    "cmp    0x34(%esp), %ebx\n"
2241 -  "jl     .lscaleloop\n"
2242 +  "jl     0b\n"
2243    "popa\n"
2244    "ret\n"
2245
2246 -".lscalelastpixel:"
2247 +"2:"
2248    "paddsw %mm0, %mm1\n"
2249    "psraw $6, %mm1\n"
2250    "packuswb %mm1, %mm1\n"
2251    "movd %mm1, (%ebp)\n"
2252    "popa\n"
2253    "ret\n"
2254 +#if !defined(XP_MACOSX)
2255 +  ".previous\n"
2256 +#endif
2257  );
2258
2259 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
2260 -
2261 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
2262 -                                    const uint8* u_buf,
2263 -                                    const uint8* v_buf,
2264 -                                    uint8* rgb_buf,
2265 -                                    int width,
2266 -                                    int16 *kCoefficientsRgbY);
2267 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2268 +                              const uint8* u_buf,
2269 +                              const uint8* v_buf,
2270 +                              uint8* rgb_buf,
2271 +                              int width,
2272 +                              int source_dx)
2273 +{
2274 +  // Run-time dispatch: use the assembly row scaler when the CPU reports SSE.
2275 +  if (mozilla::supports_sse()) {
2276 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2277 +                                 width, source_dx);
2278 +    return;  // Row is already written; do not redo it with the C routine.
2279 +  }
2280 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2281 +}
2282 +
2283 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
2284 +
2285 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2286 +                                 const uint8* u_buf,
2287 +                                 const uint8* v_buf,
2288 +                                 uint8* rgb_buf,
2289 +                                 int width,
2290 +                                 int16 *kCoefficientsRgbY);
2291 +
2292    asm(
2293    ".text\n"
2294 -#if defined(OS_MACOSX)
2295 -"_PICConvertYUVToRGB32Row:\n"
2296 +#if defined(XP_MACOSX)
2297 +"_PICConvertYUVToRGB32Row_SSE:\n"
2298  #else
2299 -"PICConvertYUVToRGB32Row:\n"
2300 +"PICConvertYUVToRGB32Row_SSE:\n"
2301  #endif
2302    "pusha\n"
2303    "mov    0x24(%esp),%edx\n"
2304    "mov    0x28(%esp),%edi\n"
2305    "mov    0x2c(%esp),%esi\n"
2306    "mov    0x30(%esp),%ebp\n"
2307    "mov    0x38(%esp),%ecx\n"
2308
2309 -  "jmp    .Lconvertend\n"
2310 -
2311 -".Lconvertloop:"
2312 +  "jmp    1f\n"
2313 +
2314 +"0:"
2315    "movzbl (%edi),%eax\n"
2316    "add    $0x1,%edi\n"
2317    "movzbl (%esi),%ebx\n"
2318    "add    $0x1,%esi\n"
2319    "movq   2048(%ecx,%eax,8),%mm0\n"
2320    "movzbl (%edx),%eax\n"
2321    "paddsw 4096(%ecx,%ebx,8),%mm0\n"
2322    "movzbl 0x1(%edx),%ebx\n"
2323 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
2324    "movq   0(%ecx,%ebx,8),%mm2\n"
2325    "paddsw %mm0,%mm1\n"
2326    "paddsw %mm0,%mm2\n"
2327    "psraw  $0x6,%mm1\n"
2328    "psraw  $0x6,%mm2\n"
2329    "packuswb %mm2,%mm1\n"
2330    "movntq %mm1,0x0(%ebp)\n"
2331    "add    $0x8,%ebp\n"
2332 -".Lconvertend:"
2333 +"1:"
2334    "subl   $0x2,0x34(%esp)\n"
2335 -  "jns    .Lconvertloop\n"
2336 +  "jns    0b\n"
2337
2338    "andl   $0x1,0x34(%esp)\n"
2339 -  "je     .Lconvertdone\n"
2340 +  "je     2f\n"
2341
2342    "movzbl (%edi),%eax\n"
2343    "movq   2048(%ecx,%eax,8),%mm0\n"
2344    "movzbl (%esi),%eax\n"
2345    "paddsw 4096(%ecx,%eax,8),%mm0\n"
2346    "movzbl (%edx),%eax\n"
2347    "movq   0(%ecx,%eax,8),%mm1\n"
2348    "paddsw %mm0,%mm1\n"
2349    "psraw  $0x6,%mm1\n"
2350    "packuswb %mm1,%mm1\n"
2351    "movd   %mm1,0x0(%ebp)\n"
2352 -".Lconvertdone:\n"
2353 +"2:"
2354    "popa\n"
2355    "ret\n"
2356 +#if !defined(XP_MACOSX)
2357 +  ".previous\n"
2358 +#endif
2359  );
2360
2361  void FastConvertYUVToRGB32Row(const uint8* y_buf,
2362                                const uint8* u_buf,
2363                                const uint8* v_buf,
2364                                uint8* rgb_buf,
2365 -                              int width) {
2366 -  PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
2367 -                          &kCoefficientsRgbY[0][0]);
2368 -}
2369 -
2370 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
2371 +                              int width)
2372 +{
2373 +  // Call the _SSE assembly only when the CPU reports SSE; otherwise use C.
2374 +  if (!mozilla::supports_sse()) {
2375 +    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2376 +    return;
2377 +  }
2378 +  PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2379 +                              &kCoefficientsRgbY[0][0]);
2380 +}
2381 +
2382 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2383                                 const uint8* u_buf,
2384                                 const uint8* v_buf,
2385                                 uint8* rgb_buf,
2386                                 int width,
2387                                 int source_dx,
2388                                 int16 *kCoefficientsRgbY);
2389
2390    asm(
2391    ".text\n"
2392 -#if defined(OS_MACOSX)
2393 -"_PICScaleYUVToRGB32Row:\n"
2394 +#if defined(XP_MACOSX)
2395 +"_PICScaleYUVToRGB32Row_SSE:\n"
2396  #else
2397 -"PICScaleYUVToRGB32Row:\n"
2398 +"PICScaleYUVToRGB32Row_SSE:\n"
2399  #endif
2400    "pusha\n"
2401    "mov    0x24(%esp),%edx\n"
2402    "mov    0x28(%esp),%edi\n"
2403    "mov    0x2c(%esp),%esi\n"
2404    "mov    0x30(%esp),%ebp\n"
2405    "mov    0x3c(%esp),%ecx\n"
2406    "xor    %ebx,%ebx\n"
2407 -  "jmp    Lscaleend\n"
2408 -
2409 -"Lscaleloop:"
2410 +  "jmp    1f\n"
2411 +
2412 +"0:"
2413    "mov    %ebx,%eax\n"
2414    "sar    $0x11,%eax\n"
2415    "movzbl (%edi,%eax,1),%eax\n"
2416    "movq   2048(%ecx,%eax,8),%mm0\n"
2417    "mov    %ebx,%eax\n"
2418    "sar    $0x11,%eax\n"
2419    "movzbl (%esi,%eax,1),%eax\n"
2420    "paddsw 4096(%ecx,%eax,8),%mm0\n"
2421 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
2422    "movq   0(%ecx,%eax,8),%mm2\n"
2423    "paddsw %mm0,%mm1\n"
2424    "paddsw %mm0,%mm2\n"
2425    "psraw  $0x6,%mm1\n"
2426    "psraw  $0x6,%mm2\n"
2427    "packuswb %mm2,%mm1\n"
2428    "movntq %mm1,0x0(%ebp)\n"
2429    "add    $0x8,%ebp\n"
2430 -"Lscaleend:"
2431 +"1:"
2432    "subl   $0x2,0x34(%esp)\n"
2433 -  "jns    Lscaleloop\n"
2434 +  "jns    0b\n"
2435
2436    "andl   $0x1,0x34(%esp)\n"
2437 -  "je     Lscaledone\n"
2438 +  "je     2f\n"
2439
2440    "mov    %ebx,%eax\n"
2441    "sar    $0x11,%eax\n"
2442    "movzbl (%edi,%eax,1),%eax\n"
2443    "movq   2048(%ecx,%eax,8),%mm0\n"
2444    "mov    %ebx,%eax\n"
2445    "sar    $0x11,%eax\n"
2446    "movzbl (%esi,%eax,1),%eax\n"
2447 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
2448    "sar    $0x10,%eax\n"
2449    "movzbl (%edx,%eax,1),%eax\n"
2450    "movq   0(%ecx,%eax,8),%mm1\n"
2451    "paddsw %mm0,%mm1\n"
2452    "psraw  $0x6,%mm1\n"
2453    "packuswb %mm1,%mm1\n"
2454    "movd   %mm1,0x0(%ebp)\n"
2455
2456 -"Lscaledone:"
2457 +"2:"
2458    "popa\n"
2459    "ret\n"
2460 +#if !defined(XP_MACOSX)
2461 +  ".previous\n"
2462 +#endif
2463  );
2464
2465 -
2466  void ScaleYUVToRGB32Row(const uint8* y_buf,
2467                          const uint8* u_buf,
2468                          const uint8* v_buf,
2469                          uint8* rgb_buf,
2470                          int width,
2471 -                        int source_dx) {
2472 -  PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2473 -                        &kCoefficientsRgbY[0][0]);
2474 -}
2475 -
2476 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
2477 -                                 const uint8* u_buf,
2478 -                                 const uint8* v_buf,
2479 -                                 uint8* rgb_buf,
2480 -                                 int width,
2481 -                                 int source_dx,
2482 -                                 int16 *kCoefficientsRgbY);
2483 +                        int source_dx)
2484 +{
2485 +  // Call the _SSE assembly only when the CPU reports SSE; otherwise use C.
2486 +  if (!mozilla::supports_sse()) {
2487 +    ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2488 +    return;
2489 +  }
2490 +  PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2491 +                            &kCoefficientsRgbY[0][0]);
2492 +}
2493 +
2494 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2495 +                                     const uint8* u_buf,
2496 +                                     const uint8* v_buf,
2497 +                                     uint8* rgb_buf,
2498 +                                     int width,
2499 +                                     int source_dx,
2500 +                                     int16 *kCoefficientsRgbY);
2501 +
2502    asm(
2503    ".text\n"
2504 -#if defined(OS_MACOSX)
2505 -"_PICLinearScaleYUVToRGB32Row:\n"
2506 +#if defined(XP_MACOSX)
2507 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"
2508  #else
2509 -"PICLinearScaleYUVToRGB32Row:\n"
2510 +"PICLinearScaleYUVToRGB32Row_SSE:\n"
2511  #endif
2512    "pusha\n"
2513    "mov    0x24(%esp),%edx\n"
2514    "mov    0x30(%esp),%ebp\n"
2515    "mov    0x34(%esp),%ecx\n"
2516    "mov    0x3c(%esp),%edi\n"
2517    "xor    %ebx,%ebx\n"
2518
2519    // source_width = width * source_dx + ebx
2520    "mov    0x34(%esp), %ecx\n"
2521    "imull  0x38(%esp), %ecx\n"
2522    "mov    %ecx, 0x34(%esp)\n"
2523
2524    "mov    0x38(%esp), %ecx\n"
2525    "xor    %ebx,%ebx\n"     // x = 0
2526    "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
2527 -  "jl     .lscaleend\n"
2528 +  "jl     1f\n"
2529    "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
2530 -  "jmp    .lscaleend\n"
2531 -
2532 -".lscaleloop:"
2533 +  "jmp    1f\n"
2534 +
2535 +"0:"
2536    "mov    0x28(%esp),%esi\n"
2537    "mov    %ebx,%eax\n"
2538    "sar    $0x11,%eax\n"
2539
2540    "movzbl (%esi,%eax,1),%ecx\n"
2541    "movzbl 1(%esi,%eax,1),%esi\n"
2542    "mov    %ebx,%eax\n"
2543    "andl   $0x1fffe, %eax \n"
2544 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
2545    "imul   %eax, %esi \n"
2546    "xorl   $0xffff, %eax \n"
2547    "imul   %eax, %ecx \n"
2548    "addl   %esi, %ecx \n"
2549    "shrl   $16, %ecx \n"
2550    "movq   (%edi,%ecx,8),%mm1\n"
2551
2552    "cmp    0x34(%esp), %ebx\n"
2553 -  "jge    .lscalelastpixel\n"
2554 +  "jge    2f\n"
2555
2556    "mov    %ebx,%eax\n"
2557    "sar    $0x10,%eax\n"
2558    "movzbl (%edx,%eax,1),%ecx\n"
2559    "movzbl 1(%edx,%eax,1),%esi\n"
2560    "mov    %ebx,%eax\n"
2561    "add    0x38(%esp),%ebx\n"
2562    "andl   $0xffff, %eax \n"
2563 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
2564    "paddsw %mm0,%mm1\n"
2565    "paddsw %mm0,%mm2\n"
2566    "psraw  $0x6,%mm1\n"
2567    "psraw  $0x6,%mm2\n"
2568    "packuswb %mm2,%mm1\n"
2569    "movntq %mm1,0x0(%ebp)\n"
2570    "add    $0x8,%ebp\n"
2571
2572 -".lscaleend:"
2573 +"1:"
2574    "cmp    %ebx, 0x34(%esp)\n"
2575 -  "jg     .lscaleloop\n"
2576 +  "jg     0b\n"
2577    "popa\n"
2578    "ret\n"
2579
2580 -".lscalelastpixel:"
2581 +"2:"
2582    "paddsw %mm0, %mm1\n"
2583    "psraw $6, %mm1\n"
2584    "packuswb %mm1, %mm1\n"
2585    "movd %mm1, (%ebp)\n"
2586    "popa\n"
2587    "ret\n"
2588 +#if !defined(XP_MACOSX)
2589 +  ".previous\n"
2590 +#endif
2591  );
2592
2593 +
2594  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2595 -                        const uint8* u_buf,
2596 -                        const uint8* v_buf,
2597 -                        uint8* rgb_buf,
2598 -                        int width,
2599 -                        int source_dx) {
2600 -  PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2601 -                              &kCoefficientsRgbY[0][0]);
2602 -}
2603 -
2604 -#else  // USE_MMX
2605 -
2606 -// C reference code that mimic the YUV assembly.
2607 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2608 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2609 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2610 -
2611 -static inline void YuvPixel(uint8 y,
2612 -                            uint8 u,
2613 -                            uint8 v,
2614 -                            uint8* rgb_buf) {
2615 -
2616 -  int b = kCoefficientsRgbY[256+u][0];
2617 -  int g = kCoefficientsRgbY[256+u][1];
2618 -  int r = kCoefficientsRgbY[256+u][2];
2619 -  int a = kCoefficientsRgbY[256+u][3];
2620 -
2621 -  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2622 -  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2623 -  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2624 -  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2625 -
2626 -  b = paddsw(b, kCoefficientsRgbY[y][0]);
2627 -  g = paddsw(g, kCoefficientsRgbY[y][1]);
2628 -  r = paddsw(r, kCoefficientsRgbY[y][2]);
2629 -  a = paddsw(a, kCoefficientsRgbY[y][3]);
2630 -
2631 -  b >>= 6;
2632 -  g >>= 6;
2633 -  r >>= 6;
2634 -  a >>= 6;
2635 -
2636 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
2637 -                                        (packuswb(g) << 8) |
2638 -                                        (packuswb(r) << 16) |
2639 -                                        (packuswb(a) << 24);
2640 -}
2641 -
2642 +                              const uint8* u_buf,
2643 +                              const uint8* v_buf,
2644 +                              uint8* rgb_buf,
2645 +                              int width,
2646 +                              int source_dx)
2647 +{
2648 +  // Call the _SSE assembly only when the CPU reports SSE; otherwise use C.
2649 +  if (!mozilla::supports_sse()) {
2650 +    LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2651 +    return;
2652 +  }
2653 +  PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2654 +                                  source_dx, &kCoefficientsRgbY[0][0]);
2655 +}
2656 +#else
2657  void FastConvertYUVToRGB32Row(const uint8* y_buf,
2658                                const uint8* u_buf,
2659                                const uint8* v_buf,
2660                                uint8* rgb_buf,
2661                                int width) {
2662 -  for (int x = 0; x < width; x += 2) {
2663 -    uint8 u = u_buf[x >> 1];
2664 -    uint8 v = v_buf[x >> 1];
2665 -    uint8 y0 = y_buf[x];
2666 -    YuvPixel(y0, u, v, rgb_buf);
2667 -    if ((x + 1) < width) {
2668 -      uint8 y1 = y_buf[x + 1];
2669 -      YuvPixel(y1, u, v, rgb_buf + 4);
2670 -    }
2671 -    rgb_buf += 8;  // Advance 2 pixels.
2672 -  }
2673 -}
2674 -
2675 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.
2676 -// A shift by 17 is used to further subsample the chrominence channels.
2677 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
2678 -// for 1/65536 pixel accurate interpolation.
2679 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);  // #else build: no asm variant, forward to C. NOTE(review): meaning of the literal 1 is set by the _C routine, not visible here
2680 +}
2681 +
2682  void ScaleYUVToRGB32Row(const uint8* y_buf,
2683                          const uint8* u_buf,
2684                          const uint8* v_buf,
2685                          uint8* rgb_buf,
2686                          int width,
2687                          int source_dx) {
2688 -  int x = 0;
2689 -  for (int i = 0; i < width; i += 2) {
2690 -    int y = y_buf[x >> 16];
2691 -    int u = u_buf[(x >> 17)];
2692 -    int v = v_buf[(x >> 17)];
2693 -    YuvPixel(y, u, v, rgb_buf);
2694 -    x += source_dx;
2695 -    if ((i + 1) < width) {
2696 -      y = y_buf[x >> 16];
2697 -      YuvPixel(y, u, v, rgb_buf+4);
2698 -      x += source_dx;
2699 -    }
2700 -    rgb_buf += 8;
2701 -  }
2702 -}
2703 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);  // #else build: no asm variant, forward to C
2704 +}
2705
2706  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2707                                const uint8* u_buf,
2708                                const uint8* v_buf,
2709                                uint8* rgb_buf,
2710                                int width,
2711                                int source_dx) {
2712 -  int x = 0;
2713 -  if (source_dx >= 0x20000) {
2714 -    x = 32768;
2715 -  }
2716 -  for (int i = 0; i < width; i += 2) {
2717 -    int y0 = y_buf[x >> 16];
2718 -    int y1 = y_buf[(x >> 16) + 1];
2719 -    int u0 = u_buf[(x >> 17)];
2720 -    int u1 = u_buf[(x >> 17) + 1];
2721 -    int v0 = v_buf[(x >> 17)];
2722 -    int v1 = v_buf[(x >> 17) + 1];
2723 -    int y_frac = (x & 65535);
2724 -    int uv_frac = ((x >> 1) & 65535);
2725 -    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2726 -    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
2727 -    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
2728 -    YuvPixel(y, u, v, rgb_buf);
2729 -    x += source_dx;
2730 -    if ((i + 1) < width) {
2731 -      y0 = y_buf[x >> 16];
2732 -      y1 = y_buf[(x >> 16) + 1];
2733 -      y_frac = (x & 65535);
2734 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2735 -      YuvPixel(y, u, v, rgb_buf+4);
2736 -      x += source_dx;
2737 -    }
2738 -    rgb_buf += 8;
2739 -  }
2740 -}
2741 -
2742 -#endif  // USE_MMX
2743 -}  // extern "C"
2744 -
2745 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2746 +}
2747 +#endif
2748 +
2749 +}
2750 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
2751 --- a/gfx/ycbcr/yuv_row_table.cpp
2752 +++ b/gfx/ycbcr/yuv_row_table.cpp
2753 @@ -1,13 +1,13 @@
2754  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2755  // Use of this source code is governed by a BSD-style license that can be
2756  // found in the LICENSE file.
2757
2758 -#include "media/base/yuv_row.h"
2759 +#include "yuv_row.h"
2760
2761  extern "C" {
2762
2763  #define RGBY(i) { \
2764    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2765    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2766    static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2767    0 \
2768 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
2769 --- a/gfx/ycbcr/yuv_row_win.cpp
2770 +++ b/gfx/ycbcr/yuv_row_win.cpp
2771 @@ -1,26 +1,27 @@
2772  // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2773  // Use of this source code is governed by a BSD-style license that can be
2774  // found in the LICENSE file.
2775
2776 -#include "media/base/yuv_row.h"
2777 +#include "yuv_row.h"
2778 +#include "mozilla/SSE.h"
2779
2780  #define kCoefficientsRgbU kCoefficientsRgbY + 2048
2781  #define kCoefficientsRgbV kCoefficientsRgbY + 4096
2782
2783  extern "C" {
2784
2785 -#if USE_MMX
2786 -__declspec(naked)
2787 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2788 -                              const uint8* u_buf,
2789 -                              const uint8* v_buf,
2790 -                              uint8* rgb_buf,
2791 -                              int width) {
2792 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
2793 +__declspec(naked)
2794 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2795 +                                  const uint8* u_buf,
2796 +                                  const uint8* v_buf,
2797 +                                  uint8* rgb_buf,
2798 +                                  int width) {
2799    __asm {
2800      pushad
2801      mov       edx, [esp + 32 + 4]   // Y
2802      mov       edi, [esp + 32 + 8]   // U
2803      mov       esi, [esp + 32 + 12]  // V
2804      mov       ebp, [esp + 32 + 16]  // rgb
2805      mov       ecx, [esp + 32 + 20]  // width
2806      jmp       convertend
2807 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
2808   convertdone :
2809
2810      popad
2811      ret
2812    }
2813  }
2814
2815  __declspec(naked)
2816 -void ConvertYUVToRGB32Row(const uint8* y_buf,
2817 -                          const uint8* u_buf,
2818 -                          const uint8* v_buf,
2819 -                          uint8* rgb_buf,
2820 -                          int width,
2821 -                          int step) {
2822 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2823 +                              const uint8* u_buf,
2824 +                              const uint8* v_buf,
2825 +                              uint8* rgb_buf,
2826 +                              int width,
2827 +                              int step) {
2828    __asm {
2829      pushad
2830      mov       edx, [esp + 32 + 4]   // Y
2831      mov       edi, [esp + 32 + 8]   // U
2832      mov       esi, [esp + 32 + 12]  // V
2833      mov       ebp, [esp + 32 + 16]  // rgb
2834      mov       ecx, [esp + 32 + 20]  // width
2835      mov       ebx, [esp + 32 + 24]  // step
2836 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
2837   wdone :
2838
2839      popad
2840      ret
2841    }
2842  }
2843
2844  __declspec(naked)
2845 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
2846 -                                const uint8* u_buf,
2847 -                                const uint8* v_buf,
2848 -                                uint8* rgb_buf,
2849 -                                int width,
2850 -                                int ystep,
2851 -                                int uvstep) {
2852 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2853 +                                    const uint8* u_buf,
2854 +                                    const uint8* v_buf,
2855 +                                    uint8* rgb_buf,
2856 +                                    int width,
2857 +                                    int ystep,
2858 +                                    int uvstep) {
2859    __asm {
2860      pushad
2861      mov       edx, [esp + 32 + 4]   // Y
2862      mov       edi, [esp + 32 + 8]   // U
2863      mov       esi, [esp + 32 + 12]  // V
2864      mov       ebp, [esp + 32 + 16]  // rgb
2865      mov       ecx, [esp + 32 + 20]  // width
2866      jmp       wend
2867 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
2868   wdone :
2869
2870      popad
2871      ret
2872    }
2873  }
2874
2875  __declspec(naked)
2876 -void DoubleYUVToRGB32Row(const uint8* y_buf,
2877 -                         const uint8* u_buf,
2878 -                         const uint8* v_buf,
2879 -                         uint8* rgb_buf,
2880 -                         int width) {
2881 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
2882 +                             const uint8* u_buf,
2883 +                             const uint8* v_buf,
2884 +                             uint8* rgb_buf,
2885 +                             int width) {
2886    __asm {
2887      pushad
2888      mov       edx, [esp + 32 + 4]   // Y
2889      mov       edi, [esp + 32 + 8]   // U
2890      mov       esi, [esp + 32 + 12]  // V
2891      mov       ebp, [esp + 32 + 16]  // rgb
2892      mov       ecx, [esp + 32 + 20]  // width
2893      jmp       wend
2894 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
2895      jns       wloop1
2896   wdone :
2897      popad
2898      ret
2899    }
2900  }
2901
2902  // This version does general purpose scaling by any amount, up or down.
2903 -// The only thing it can not do it rotation by 90 or 270.
2904 -// For performance the chroma is under sampled, reducing cost of a 3x
2905 +// The only thing it cannot do is rotation by 90 or 270.
2906 +// For performance the chroma is under-sampled, reducing cost of a 3x
2907  // 1080p scale from 8.4 ms to 5.4 ms.
2908  __declspec(naked)
2909 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2910 -                        const uint8* u_buf,
2911 -                        const uint8* v_buf,
2912 -                        uint8* rgb_buf,
2913 -                        int width,
2914 -                        int source_dx) {
2915 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2916 +                            const uint8* u_buf,
2917 +                            const uint8* v_buf,
2918 +                            uint8* rgb_buf,
2919 +                            int width,
2920 +                            int source_dx) {
2921    __asm {
2922      pushad
2923      mov       edx, [esp + 32 + 4]   // Y
2924      mov       edi, [esp + 32 + 8]   // U
2925      mov       esi, [esp + 32 + 12]  // V
2926      mov       ebp, [esp + 32 + 16]  // rgb
2927      mov       ecx, [esp + 32 + 20]  // width
2928      xor       ebx, ebx              // x
2929 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2930
2931   scaledone :
2932      popad
2933      ret
2934    }
2935  }
2936
2937  __declspec(naked)
2938 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2939 -                              const uint8* u_buf,
2940 -                              const uint8* v_buf,
2941 -                              uint8* rgb_buf,
2942 -                              int width,
2943 -                              int source_dx) {
2944 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2945 +                                  const uint8* u_buf,
2946 +                                  const uint8* v_buf,
2947 +                                  uint8* rgb_buf,
2948 +                                  int width,
2949 +                                  int source_dx) {
2950    __asm {
2951      pushad
2952      mov       edx, [esp + 32 + 4]  // Y
2953      mov       edi, [esp + 32 + 8]  // U
2954                  // [esp + 32 + 12] // V
2955      mov       ebp, [esp + 32 + 16] // rgb
2956      mov       ecx, [esp + 32 + 20] // width
2957      imul      ecx, [esp + 32 + 24] // source_dx
2958 @@ -438,152 +439,60 @@ lscalelastpixel:
2959      paddsw    mm1, mm0
2960      psraw     mm1, 6
2961      packuswb  mm1, mm1
2962      movd      [ebp], mm1
2963      popad
2964      ret
2965    };
2966  }
2967 -#else  // USE_MMX
2968 -
2969 -// C reference code that mimic the YUV assembly.
2970 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2971 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2972 -    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2973 -
2974 -static inline void YuvPixel(uint8 y,
2975 -                            uint8 u,
2976 -                            uint8 v,
2977 -                            uint8* rgb_buf) {
2978 -
2979 -  int b = kCoefficientsRgbY[256+u][0];
2980 -  int g = kCoefficientsRgbY[256+u][1];
2981 -  int r = kCoefficientsRgbY[256+u][2];
2982 -  int a = kCoefficientsRgbY[256+u][3];
2983 -
2984 -  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2985 -  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2986 -  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2987 -  a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2988 -
2989 -  b = paddsw(b, kCoefficientsRgbY[y][0]);
2990 -  g = paddsw(g, kCoefficientsRgbY[y][1]);
2991 -  r = paddsw(r, kCoefficientsRgbY[y][2]);
2992 -  a = paddsw(a, kCoefficientsRgbY[y][3]);
2993 -
2994 -  b >>= 6;
2995 -  g >>= 6;
2996 -  r >>= 6;
2997 -  a >>= 6;
2998 -
2999 -  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
3000 -                                        (packuswb(g) << 8) |
3001 -                                        (packuswb(r) << 16) |
3002 -                                        (packuswb(a) << 24);
3003 -}
3004 -
3005 -#if TEST_MMX_YUV
3006 -static inline void YuvPixel(uint8 y,
3007 -                            uint8 u,
3008 -                            uint8 v,
3009 -                            uint8* rgb_buf) {
3010 -
3011 -  __asm {
3012 -    movzx     eax, u
3013 -    movq      mm0, [kCoefficientsRgbY+2048 + 8 * eax]
3014 -    movzx     eax, v
3015 -    paddsw    mm0, [kCoefficientsRgbY+4096 + 8 * eax]
3016 -    movzx     eax, y
3017 -    movq      mm1, [kCoefficientsRgbY + 8 * eax]
3018 -    paddsw    mm1, mm0
3019 -    psraw     mm1, 6
3020 -    packuswb  mm1, mm1
3021 -    mov       eax, rgb_buf
3022 -    movd      [eax], mm1
3023 -    emms
3024 -  }
3025 -}
3026 -#endif
3027 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3028
3029  void FastConvertYUVToRGB32Row(const uint8* y_buf,
3030                                const uint8* u_buf,
3031                                const uint8* v_buf,
3032                                uint8* rgb_buf,
3033                                int width) {
3034 -  for (int x = 0; x < width; x += 2) {
3035 -    uint8 u = u_buf[x >> 1];
3036 -    uint8 v = v_buf[x >> 1];
3037 -    uint8 y0 = y_buf[x];
3038 -    YuvPixel(y0, u, v, rgb_buf);
3039 -    if ((x + 1) < width) {
3040 -      uint8 y1 = y_buf[x + 1];
3041 -      YuvPixel(y1, u, v, rgb_buf + 4);
3042 -    }
3043 -    rgb_buf += 8;  // Advance 2 pixels.
3044 -  }
3045 -}
3046 -
3047 -// 16.16 fixed point is used.  A shift by 16 isolates the integer.
3048 -// A shift by 17 is used to further subsample the chrominence channels.
3049 -// & 0xffff isolates the fixed point fraction.  >> 2 to get the upper 2 bits,
3050 -// for 1/65536 pixel accurate interpolation.
3051 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3052 +  if (mozilla::supports_sse()) {
3053 +    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
3054 +    return;
3055 +  }
3056 +#endif
3057 +
3058 +  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
3059 +}
3060 +
3061  void ScaleYUVToRGB32Row(const uint8* y_buf,
3062                          const uint8* u_buf,
3063                          const uint8* v_buf,
3064                          uint8* rgb_buf,
3065                          int width,
3066                          int source_dx) {
3067 -  int x = 0;
3068 -  for (int i = 0; i < width; i += 2) {
3069 -    int y = y_buf[x >> 16];
3070 -    int u = u_buf[(x >> 17)];
3071 -    int v = v_buf[(x >> 17)];
3072 -    YuvPixel(y, u, v, rgb_buf);
3073 -    x += source_dx;
3074 -    if ((i + 1) < width) {
3075 -      y = y_buf[x >> 16];
3076 -      YuvPixel(y, u, v, rgb_buf+4);
3077 -      x += source_dx;
3078 -    }
3079 -    rgb_buf += 8;
3080 -  }
3081 -}
3082 +
3083 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3084 +  if (mozilla::supports_sse()) {
3085 +    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3086 +    return;
3087 +  }
3088 +#endif
3089 +
3090 +  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3091 +}
3092
3093  void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3094                                const uint8* u_buf,
3095                                const uint8* v_buf,
3096                                uint8* rgb_buf,
3097                                int width,
3098                                int source_dx) {
3099 -  int x = 0;
3100 -  if (source_dx >= 0x20000) {
3101 -    x = 32768;
3102 -  }
3103 -  for (int i = 0; i < width; i += 2) {
3104 -    int y0 = y_buf[x >> 16];
3105 -    int y1 = y_buf[(x >> 16) + 1];
3106 -    int u0 = u_buf[(x >> 17)];
3107 -    int u1 = u_buf[(x >> 17) + 1];
3108 -    int v0 = v_buf[(x >> 17)];
3109 -    int v1 = v_buf[(x >> 17) + 1];
3110 -    int y_frac = (x & 65535);
3111 -    int uv_frac = ((x >> 1) & 65535);
3112 -    int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3113 -    int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
3114 -    int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
3115 -    YuvPixel(y, u, v, rgb_buf);
3116 -    x += source_dx;
3117 -    if ((i + 1) < width) {
3118 -      y0 = y_buf[x >> 16];
3119 -      y1 = y_buf[(x >> 16) + 1];
3120 -      y_frac = (x & 65535);
3121 -      y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3122 -      YuvPixel(y, u, v, rgb_buf+4);
3123 -      x += source_dx;
3124 -    }
3125 -    rgb_buf += 8;
3126 -  }
3127 -}
3128 -
3129 -#endif  // USE_MMX
3130 -}  // extern "C"
3131 -
3132 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3133 +  if (mozilla::supports_sse()) {
3134 +    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
3135 +                                 source_dx);
3136 +    return;
3137 +  }
3138 +#endif
3139 +
3140 +  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3141 +}
3142 +
3143 +} // extern "C"