1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
2 --- a/gfx/ycbcr/yuv_convert.cpp
3 +++ b/gfx/ycbcr/yuv_convert.cpp
5 // http://www.fourcc.org/yuv.php
6 // The actual conversion is best described here
7 // http://en.wikipedia.org/wiki/YUV
8 // An article on optimizing YUV conversion using tables instead of multiplies
9 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
11 // YV12 is a full plane of Y and a half height, half width chroma planes
12 // YV16 is a full plane of Y and a full height, half width chroma planes
13 +// YV24 is a full plane of Y and a full height, full width chroma planes
15 // ARGB pixel format is output, which on little endian is stored as BGRA.
16 // The alpha is set to 255, allowing the application to use RGBA or RGB32.
18 -#include "media/base/yuv_convert.h"
19 +#include "yuv_convert.h"
21 // Header for low level row functions.
22 -#include "media/base/yuv_row.h"
25 -#if defined(_MSC_VER)
28 -#include <mmintrin.h>
33 -#include <emmintrin.h>
39 +#include "mozilla/SSE.h"
45 // 16.16 fixed point arithmetic
46 const int kFractionBits = 16;
47 const int kFractionMax = 1 << kFractionBits;
48 const int kFractionMask = ((1 << kFractionBits) - 1);
50 // Convert a frame of YUV to 32 bit ARGB.
51 -void ConvertYUVToRGB32(const uint8* y_buf,
61 - unsigned int y_shift = yuv_type;
62 - for (int y = 0; y < height; ++y) {
63 - uint8* rgb_row = rgb_buf + y * rgb_pitch;
64 - const uint8* y_ptr = y_buf + y * y_pitch;
65 - const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
66 - const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
68 - FastConvertYUVToRGB32Row(y_ptr,
74 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
86 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
87 + unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
88 + // Test for SSE because the optimized code uses movntq, which is not part of MMX.
89 + bool has_sse = supports_mmx() && supports_sse();
90 + // There is no optimized YV24 SSE routine so we check for this and
91 + // fall back to the C code.
92 + has_sse &= yuv_type != YV24;
93 + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
94 + int x_width = odd_pic_x ? pic_width - 1 : pic_width;
96 + for (int y = pic_y; y < pic_height + pic_y; ++y) {
97 + uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
98 + const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
99 + const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
100 + const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
103 + // Handle the single odd pixel manually and use the
104 + // fast routines for the remaining pixels.
105 + FastConvertYUVToRGB32Row_C(y_ptr++,
115 + FastConvertYUVToRGB32Row(y_ptr,
122 + FastConvertYUVToRGB32Row_C(y_ptr,
131 // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
136 -// FilterRows combines two rows of the image using linear interpolation.
137 -// SSE2 version does 16 pixels at a time
139 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
140 - int source_width, int source_y_fraction) {
141 - __m128i zero = _mm_setzero_si128();
142 - __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
143 - __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
145 - const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
146 - const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
147 - __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
148 - __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
151 - __m128i y0 = _mm_loadu_si128(y0_ptr128);
152 - __m128i y1 = _mm_loadu_si128(y1_ptr128);
153 - __m128i y2 = _mm_unpackhi_epi8(y0, zero);
154 - __m128i y3 = _mm_unpackhi_epi8(y1, zero);
155 - y0 = _mm_unpacklo_epi8(y0, zero);
156 - y1 = _mm_unpacklo_epi8(y1, zero);
157 - y0 = _mm_mullo_epi16(y0, y0_fraction);
158 - y1 = _mm_mullo_epi16(y1, y1_fraction);
159 - y2 = _mm_mullo_epi16(y2, y0_fraction);
160 - y3 = _mm_mullo_epi16(y3, y1_fraction);
161 - y0 = _mm_add_epi16(y0, y1);
162 - y2 = _mm_add_epi16(y2, y3);
163 - y0 = _mm_srli_epi16(y0, 8);
164 - y2 = _mm_srli_epi16(y2, 8);
165 - y0 = _mm_packus_epi16(y0, y2);
169 - } while (dest128 < end128);
172 -// MMX version does 8 pixels at a time
173 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
174 - int source_width, int source_y_fraction) {
175 - __m64 zero = _mm_setzero_si64();
176 - __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
177 - __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
179 - const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
180 - const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
181 - __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
182 - __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
185 - __m64 y0 = *y0_ptr64++;
186 - __m64 y1 = *y1_ptr64++;
187 - __m64 y2 = _mm_unpackhi_pi8(y0, zero);
188 - __m64 y3 = _mm_unpackhi_pi8(y1, zero);
189 - y0 = _mm_unpacklo_pi8(y0, zero);
190 - y1 = _mm_unpacklo_pi8(y1, zero);
191 - y0 = _mm_mullo_pi16(y0, y0_fraction);
192 - y1 = _mm_mullo_pi16(y1, y1_fraction);
193 - y2 = _mm_mullo_pi16(y2, y0_fraction);
194 - y3 = _mm_mullo_pi16(y3, y1_fraction);
195 - y0 = _mm_add_pi16(y0, y1);
196 - y2 = _mm_add_pi16(y2, y3);
197 - y0 = _mm_srli_pi16(y0, 8);
198 - y2 = _mm_srli_pi16(y2, 8);
199 - y0 = _mm_packs_pu16(y0, y2);
201 - } while (dest64 < end64);
203 -#else // no MMX or SSE2
208 // C version does 8 at a time to mimic MMX code
209 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
210 - int source_width, int source_y_fraction) {
211 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
212 + int source_width, int source_y_fraction) {
213 int y1_fraction = source_y_fraction;
214 int y0_fraction = 256 - y1_fraction;
215 uint8* end = ybuf + source_width;
217 ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
218 ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
219 ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
220 ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
221 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
222 ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
223 ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
224 ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
228 } while (ybuf < end);
232 +#ifdef MOZILLA_MAY_SUPPORT_MMX
233 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
234 + int source_width, int source_y_fraction);
237 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
238 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
239 + int source_width, int source_y_fraction);
242 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
243 + const uint8* y1_ptr, int source_width,
244 + int source_y_fraction) {
245 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
246 + if (mozilla::supports_sse2()) {
247 + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
252 +#ifdef MOZILLA_MAY_SUPPORT_MMX
253 + if (mozilla::supports_mmx()) {
254 + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
259 + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
263 // Scale a frame of YUV to 32 bit ARGB.
264 -void ScaleYUVToRGB32(const uint8* y_buf,
265 - const uint8* u_buf,
266 - const uint8* v_buf,
276 - Rotate view_rotate,
277 - ScaleFilter filter) {
278 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
279 + const uint8* u_buf,
280 + const uint8* v_buf,
290 + Rotate view_rotate,
291 + ScaleFilter filter) {
292 + bool has_mmx = supports_mmx();
294 // 4096 allows 3 buffers to fit in 12k.
295 // Helps performance on CPU with 16K L1 cache.
296 // Large enough for 3830x2160 and 30" displays which are 2560x1600.
297 const int kFilterBufferSize = 4096;
298 // Disable filtering if the screen is too big (to avoid buffer overflows).
299 // This should never happen to regular users: they don't have monitors
300 // wider than 4096 pixels.
301 // TODO(fbarchard): Allow rotated videos to filter.
302 if (source_width > kFilterBufferSize || view_rotate)
303 filter = FILTER_NONE;
305 - unsigned int y_shift = yuv_type;
306 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
307 // Diagram showing origin and direction of source sampling.
313 // Rotations that start at right side of image.
314 if ((view_rotate == ROTATE_180) ||
315 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
316 int source_uv_fraction =
317 ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
319 const uint8* y_ptr = y0_ptr;
320 const uint8* u_ptr = u0_ptr;
321 const uint8* v_ptr = v0_ptr;
322 // Apply vertical filtering if necessary.
323 // TODO(fbarchard): Remove memcpy when not necessary.
324 - if (filter & media::FILTER_BILINEAR_V) {
325 + if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
326 if (yscale_fixed != kFractionMax &&
327 source_y_fraction && ((source_y + 1) < source_height)) {
328 FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
330 memcpy(ybuf, y0_ptr, source_width);
333 ybuf[source_width] = ybuf[source_width-1];
334 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
337 ubuf[uv_source_width] = ubuf[uv_source_width - 1];
338 vbuf[uv_source_width] = vbuf[uv_source_width - 1];
340 if (source_dx == kFractionMax) { // Not scaled
341 FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
344 - if (filter & FILTER_BILINEAR_H) {
345 + } else if (filter & FILTER_BILINEAR_H) {
346 LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
347 dest_pixel, width, source_dx);
349 // Specialized scalers and rotation.
350 -#if USE_MMX && defined(_MSC_VER)
351 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
352 + if(mozilla::supports_sse()) {
353 if (width == (source_width * 2)) {
354 - DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
355 - dest_pixel, width);
356 + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
357 + dest_pixel, width);
358 } else if ((source_dx & kFractionMask) == 0) {
359 // Scaling by integer scale factor. ie half.
360 - ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
362 - source_dx >> kFractionBits);
363 + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
365 + source_dx >> kFractionBits);
366 } else if (source_dx_uv == source_dx) { // Not rotated.
367 ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
368 dest_pixel, width, source_dx);
370 - RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
372 - source_dx >> kFractionBits,
373 - source_dx_uv >> kFractionBits);
374 + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
376 + source_dx >> kFractionBits,
377 + source_dx_uv >> kFractionBits);
381 + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
382 + dest_pixel, width, source_dx);
385 - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
386 - dest_pixel, width, source_dx);
389 + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
390 + dest_pixel, width, source_dx);
394 // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
398 -} // namespace media
404 +} // namespace mozilla
405 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
406 --- a/gfx/ycbcr/yuv_convert.h
407 +++ b/gfx/ycbcr/yuv_convert.h
409 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
410 // Use of this source code is governed by a BSD-style license that can be
411 // found in the LICENSE file.
413 #ifndef MEDIA_BASE_YUV_CONVERT_H_
414 #define MEDIA_BASE_YUV_CONVERT_H_
416 -#include "base/basictypes.h"
420 +#include "chromium_types.h"
421 +#include "gfxCore.h"
427 // Type of YUV surface.
428 // The value of these enums matter as they are used to shift vertical indices.
430 - YV16 = 0, // YV16 is half width and full height chroma channels.
431 - YV12 = 1, // YV12 is half width and half height chroma channels.
432 + YV12 = 0, // YV12 is half width and half height chroma channels.
433 + YV16 = 1, // YV16 is half width and full height chroma channels.
434 + YV24 = 2 // YV24 is full width and full height chroma channels.
437 // Mirror means flip the image horizontally, as in looking in a mirror.
438 // Rotate happens after mirroring.
440 ROTATE_0, // Rotation off.
441 ROTATE_90, // Rotate clockwise.
442 ROTATE_180, // Rotate upside down.
443 ROTATE_270, // Rotate counter clockwise.
444 MIRROR_ROTATE_0, // Mirror horizontally.
445 MIRROR_ROTATE_90, // Mirror then Rotate clockwise.
446 MIRROR_ROTATE_180, // Mirror vertically.
447 - MIRROR_ROTATE_270, // Transpose.
448 + MIRROR_ROTATE_270 // Transpose.
451 // Filter affects how scaling looks.
453 FILTER_NONE = 0, // No filter (point sampled).
454 FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
455 FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
456 - FILTER_BILINEAR = 3, // Bilinear filter.
457 + FILTER_BILINEAR = 3 // Bilinear filter.
460 // Convert a frame of YUV to 32 bit ARGB.
461 // Pass in YV16/YV12 depending on source format
462 -void ConvertYUVToRGB32(const uint8* yplane,
463 - const uint8* uplane,
464 - const uint8* vplane,
472 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
473 + const uint8* uplane,
474 + const uint8* vplane,
485 // Scale a frame of YUV to 32 bit ARGB.
486 // Supports rotation and mirroring.
487 -void ScaleYUVToRGB32(const uint8* yplane,
488 - const uint8* uplane,
489 - const uint8* vplane,
499 - Rotate view_rotate,
500 - ScaleFilter filter);
502 -} // namespace media
504 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
505 + const uint8* uplane,
506 + const uint8* vplane,
516 + Rotate view_rotate,
517 + ScaleFilter filter);
520 +} // namespace mozilla
522 #endif // MEDIA_BASE_YUV_CONVERT_H_
523 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
526 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp
528 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
529 +// Use of this source code is governed by a BSD-style license that can be
530 +// found in the LICENSE file.
532 +#include <mmintrin.h>
533 +#include "yuv_row.h"
538 +// FilterRows combines two rows of the image using linear interpolation.
539 +// MMX version does 8 pixels at a time.
540 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
541 + int source_width, int source_y_fraction) {
542 + __m64 zero = _mm_setzero_si64();
543 + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
544 + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
546 + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
547 + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
548 + __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
549 + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
552 + __m64 y0 = *y0_ptr64++;
553 + __m64 y1 = *y1_ptr64++;
554 + __m64 y2 = _mm_unpackhi_pi8(y0, zero);
555 + __m64 y3 = _mm_unpackhi_pi8(y1, zero);
556 + y0 = _mm_unpacklo_pi8(y0, zero);
557 + y1 = _mm_unpacklo_pi8(y1, zero);
558 + y0 = _mm_mullo_pi16(y0, y0_fraction);
559 + y1 = _mm_mullo_pi16(y1, y1_fraction);
560 + y2 = _mm_mullo_pi16(y2, y0_fraction);
561 + y3 = _mm_mullo_pi16(y3, y1_fraction);
562 + y0 = _mm_add_pi16(y0, y1);
563 + y2 = _mm_add_pi16(y2, y3);
564 + y0 = _mm_srli_pi16(y0, 8);
565 + y2 = _mm_srli_pi16(y2, 8);
566 + y0 = _mm_packs_pu16(y0, y2);
568 + } while (dest64 < end64);
573 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
576 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp
578 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
579 +// Use of this source code is governed by a BSD-style license that can be
580 +// found in the LICENSE file.
582 +#include <emmintrin.h>
583 +#include "yuv_row.h"
588 +// FilterRows combines two rows of the image using linear interpolation.
589 +// SSE2 version does 16 pixels at a time.
590 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
591 + int source_width, int source_y_fraction) {
592 + __m128i zero = _mm_setzero_si128();
593 + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
594 + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
596 + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
597 + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
598 + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
599 + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
602 + __m128i y0 = _mm_loadu_si128(y0_ptr128);
603 + __m128i y1 = _mm_loadu_si128(y1_ptr128);
604 + __m128i y2 = _mm_unpackhi_epi8(y0, zero);
605 + __m128i y3 = _mm_unpackhi_epi8(y1, zero);
606 + y0 = _mm_unpacklo_epi8(y0, zero);
607 + y1 = _mm_unpacklo_epi8(y1, zero);
608 + y0 = _mm_mullo_epi16(y0, y0_fraction);
609 + y1 = _mm_mullo_epi16(y1, y1_fraction);
610 + y2 = _mm_mullo_epi16(y2, y0_fraction);
611 + y3 = _mm_mullo_epi16(y3, y1_fraction);
612 + y0 = _mm_add_epi16(y0, y1);
613 + y2 = _mm_add_epi16(y2, y3);
614 + y0 = _mm_srli_epi16(y0, 8);
615 + y2 = _mm_srli_epi16(y2, 8);
616 + y0 = _mm_packus_epi16(y0, y2);
620 + } while (dest128 < end128);
625 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
626 --- a/gfx/ycbcr/yuv_row.h
627 +++ b/gfx/ycbcr/yuv_row.h
629 // yuv_row internal functions to handle YUV conversion and scaling to RGB.
630 // These functions are used from both yuv_convert.cc and yuv_scale.cc.
632 // TODO(fbarchard): Write function that can handle rotation and scaling.
634 #ifndef MEDIA_BASE_YUV_ROW_H_
635 #define MEDIA_BASE_YUV_ROW_H_
637 -#include "base/basictypes.h"
638 +#include "chromium_types.h"
642 // This is the second fastest of the scalers.
643 void FastConvertYUVToRGB32Row(const uint8* y_buf,
649 -// Can do 1x, half size or any scale down by an integer amount.
650 -// Step can be negative (mirroring, rotate 180).
651 -// This is the third fastest of the scalers.
652 -void ConvertYUVToRGB32Row(const uint8* y_buf,
653 - const uint8* u_buf,
654 - const uint8* v_buf,
659 -// Rotate is like Convert, but applies different step to Y versus U and V.
660 -// This allows rotation by 90 or 270, by stepping by stride.
661 -// This is the forth fastest of the scalers.
662 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
663 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
670 + unsigned int x_shift);
672 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
673 + const uint8* u_buf,
674 + const uint8* v_buf,
678 +// Can do 1x, half size or any scale down by an integer amount.
679 +// Step can be negative (mirroring, rotate 180).
680 +// This is the third fastest of the scalers.
681 +// Only defined on Windows x86-32.
682 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
683 + const uint8* u_buf,
684 + const uint8* v_buf,
689 +// Rotate is like Convert, but applies different step to Y versus U and V.
690 +// This allows rotation by 90 or 270, by stepping by stride.
691 +// This is the fourth fastest of the scalers.
692 +// Only defined on Windows x86-32.
693 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
694 + const uint8* u_buf,
695 + const uint8* v_buf,
701 // Doubler does 4 pixels at a time. Each pixel is replicated.
702 // This is the fastest of the scalers.
703 -void DoubleYUVToRGB32Row(const uint8* y_buf,
704 - const uint8* u_buf,
705 - const uint8* v_buf,
708 +// Only defined on Windows x86-32.
709 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
710 + const uint8* u_buf,
711 + const uint8* v_buf,
715 // Handles arbitrary scaling up or down.
716 // Mirroring is supported, but not 90 or 270 degree rotation.
717 // Chroma is under sampled every 2 pixels for performance.
718 void ScaleYUVToRGB32Row(const uint8* y_buf,
725 +void ScaleYUVToRGB32Row(const uint8* y_buf,
726 + const uint8* u_buf,
727 + const uint8* v_buf,
732 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
733 + const uint8* u_buf,
734 + const uint8* v_buf,
739 // Handles arbitrary scaling up or down with bilinear filtering.
740 // Mirroring is supported, but not 90 or 270 degree rotation.
741 // Chroma is under sampled every 2 pixels for performance.
742 // This is the slowest of the scalers.
743 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
750 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
751 + const uint8* u_buf,
752 + const uint8* v_buf,
757 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
758 + const uint8* u_buf,
759 + const uint8* v_buf,
765 #if defined(_MSC_VER)
766 #define SIMD_ALIGNED(var) __declspec(align(16)) var
768 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
770 extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
772 -// Method to force C version.
774 -//#define USE_SSE2 0
776 -#if !defined(USE_MMX)
777 -// Windows, Mac and Linux/BSD use MMX
778 -#if defined(__MMX__) || defined(_MSC_VER)
785 -#if !defined(USE_SSE2)
786 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
793 // x64 uses MMX2 (SSE) so emms is not required.
794 // Warning C4799: function has no EMMS instruction.
795 // EMMS() is slow and should be called by the calling function once per image.
796 -#if USE_MMX && !defined(ARCH_CPU_X86_64)
797 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
798 #if defined(_MSC_VER)
799 #define EMMS() __asm emms
800 #pragma warning(disable: 4799)
802 #define EMMS() asm("emms")
806 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
807 --- a/gfx/ycbcr/yuv_row_c.cpp
808 +++ b/gfx/ycbcr/yuv_row_c.cpp
810 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
811 // Use of this source code is governed by a BSD-style license that can be
812 // found in the LICENSE file.
814 -#include "media/base/yuv_row.h"
817 -#include "base/logging.h"
819 +#include "yuv_row.h"
826 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
828 -// AMD64 ABI uses register paremters.
829 -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
830 - const uint8* u_buf, // rsi
831 - const uint8* v_buf, // rdx
832 - uint8* rgb_buf, // rcx
837 - "movzb (%1),%%r10\n"
839 - "movzb (%2),%%r11\n"
841 - "movq 2048(%5,%%r10,8),%%xmm0\n"
842 - "movzb (%0),%%r10\n"
843 - "movq 4096(%5,%%r11,8),%%xmm1\n"
844 - "movzb 0x1(%0),%%r11\n"
845 - "paddsw %%xmm1,%%xmm0\n"
846 - "movq (%5,%%r10,8),%%xmm2\n"
848 - "movq (%5,%%r11,8),%%xmm3\n"
849 - "paddsw %%xmm0,%%xmm2\n"
850 - "paddsw %%xmm0,%%xmm3\n"
851 - "shufps $0x44,%%xmm3,%%xmm2\n"
852 - "psraw $0x6,%%xmm2\n"
853 - "packuswb %%xmm2,%%xmm2\n"
854 - "movq %%xmm2,0x0(%3)\n"
858 - "jns convertloop\n"
864 - "movzb (%1),%%r10\n"
865 - "movq 2048(%5,%%r10,8),%%xmm0\n"
866 - "movzb (%2),%%r10\n"
867 - "movq 4096(%5,%%r10,8),%%xmm1\n"
868 - "paddsw %%xmm1,%%xmm0\n"
869 - "movzb (%0),%%r10\n"
870 - "movq (%5,%%r10,8),%%xmm1\n"
871 - "paddsw %%xmm0,%%xmm1\n"
872 - "psraw $0x6,%%xmm1\n"
873 - "packuswb %%xmm1,%%xmm1\n"
874 - "movd %%xmm1,0x0(%3)\n"
877 - : "r"(y_buf), // %0
880 - "r"(rgb_buf), // %3
882 - "r" (kCoefficientsRgbY) // %5
883 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
887 -void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
888 - const uint8* u_buf, // rsi
889 - const uint8* v_buf, // rdx
890 - uint8* rgb_buf, // rcx
892 - int source_dx) { // r9
894 - "xor %%r11,%%r11\n"
899 - "mov %%r11,%%r10\n"
900 - "sar $0x11,%%r10\n"
901 - "movzb (%1,%%r10,1),%%rax\n"
902 - "movq 2048(%5,%%rax,8),%%xmm0\n"
903 - "movzb (%2,%%r10,1),%%rax\n"
904 - "movq 4096(%5,%%rax,8),%%xmm1\n"
905 - "lea (%%r11,%6),%%r10\n"
906 - "sar $0x10,%%r11\n"
907 - "movzb (%0,%%r11,1),%%rax\n"
908 - "paddsw %%xmm1,%%xmm0\n"
909 - "movq (%5,%%rax,8),%%xmm1\n"
910 - "lea (%%r10,%6),%%r11\n"
911 - "sar $0x10,%%r10\n"
912 - "movzb (%0,%%r10,1),%%rax\n"
913 - "movq (%5,%%rax,8),%%xmm2\n"
914 - "paddsw %%xmm0,%%xmm1\n"
915 - "paddsw %%xmm0,%%xmm2\n"
916 - "shufps $0x44,%%xmm2,%%xmm1\n"
917 - "psraw $0x6,%%xmm1\n"
918 - "packuswb %%xmm1,%%xmm1\n"
919 - "movq %%xmm1,0x0(%3)\n"
928 - "mov %%r11,%%r10\n"
929 - "sar $0x11,%%r10\n"
930 - "movzb (%1,%%r10,1),%%rax\n"
931 - "movq 2048(%5,%%rax,8),%%xmm0\n"
932 - "movzb (%2,%%r10,1),%%rax\n"
933 - "movq 4096(%5,%%rax,8),%%xmm1\n"
934 - "paddsw %%xmm1,%%xmm0\n"
935 - "sar $0x10,%%r11\n"
936 - "movzb (%0,%%r11,1),%%rax\n"
937 - "movq (%5,%%rax,8),%%xmm1\n"
938 - "paddsw %%xmm0,%%xmm1\n"
939 - "psraw $0x6,%%xmm1\n"
940 - "packuswb %%xmm1,%%xmm1\n"
941 - "movd %%xmm1,0x0(%3)\n"
945 - : "r"(y_buf), // %0
948 - "r"(rgb_buf), // %3
950 - "r" (kCoefficientsRgbY), // %5
951 - "r"(static_cast<long>(source_dx)) // %6
952 - : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
956 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
957 - const uint8* u_buf,
958 - const uint8* v_buf,
963 - "xor %%r11,%%r11\n" // x = 0
966 - "cmp $0x20000,%6\n" // if source_dx >= 2.0
968 - "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
972 - "mov %%r11,%%r10\n"
973 - "sar $0x11,%%r10\n"
975 - "movzb (%1, %%r10, 1), %%r13 \n"
976 - "movzb 1(%1, %%r10, 1), %%r14 \n"
977 - "mov %%r11, %%rax \n"
978 - "and $0x1fffe, %%rax \n"
979 - "imul %%rax, %%r14 \n"
980 - "xor $0x1fffe, %%rax \n"
981 - "imul %%rax, %%r13 \n"
982 - "add %%r14, %%r13 \n"
983 - "shr $17, %%r13 \n"
984 - "movq 2048(%5,%%r13,8), %%xmm0\n"
986 - "movzb (%2, %%r10, 1), %%r13 \n"
987 - "movzb 1(%2, %%r10, 1), %%r14 \n"
988 - "mov %%r11, %%rax \n"
989 - "and $0x1fffe, %%rax \n"
990 - "imul %%rax, %%r14 \n"
991 - "xor $0x1fffe, %%rax \n"
992 - "imul %%rax, %%r13 \n"
993 - "add %%r14, %%r13 \n"
994 - "shr $17, %%r13 \n"
995 - "movq 4096(%5,%%r13,8), %%xmm1\n"
997 - "mov %%r11, %%rax \n"
998 - "lea (%%r11,%6),%%r10\n"
999 - "sar $0x10,%%r11\n"
1000 - "paddsw %%xmm1,%%xmm0\n"
1002 - "movzb (%0, %%r11, 1), %%r13 \n"
1003 - "movzb 1(%0, %%r11, 1), %%r14 \n"
1004 - "and $0xffff, %%rax \n"
1005 - "imul %%rax, %%r14 \n"
1006 - "xor $0xffff, %%rax \n"
1007 - "imul %%rax, %%r13 \n"
1008 - "add %%r14, %%r13 \n"
1009 - "shr $16, %%r13 \n"
1010 - "movq (%5,%%r13,8),%%xmm1\n"
1012 - "mov %%r10, %%rax \n"
1013 - "lea (%%r10,%6),%%r11\n"
1014 - "sar $0x10,%%r10\n"
1016 - "movzb (%0,%%r10,1), %%r13 \n"
1017 - "movzb 1(%0,%%r10,1), %%r14 \n"
1018 - "and $0xffff, %%rax \n"
1019 - "imul %%rax, %%r14 \n"
1020 - "xor $0xffff, %%rax \n"
1021 - "imul %%rax, %%r13 \n"
1022 - "add %%r14, %%r13 \n"
1023 - "shr $16, %%r13 \n"
1024 - "movq (%5,%%r13,8),%%xmm2\n"
1026 - "paddsw %%xmm0,%%xmm1\n"
1027 - "paddsw %%xmm0,%%xmm2\n"
1028 - "shufps $0x44,%%xmm2,%%xmm1\n"
1029 - "psraw $0x6,%%xmm1\n"
1030 - "packuswb %%xmm1,%%xmm1\n"
1031 - "movq %%xmm1,0x0(%3)\n"
1034 - "jns .lscaleloop\n"
1038 - "js .lscaledone\n"
1040 - "mov %%r11,%%r10\n"
1041 - "sar $0x11,%%r10\n"
1043 - "movzb (%1,%%r10,1), %%r13 \n"
1044 - "movq 2048(%5,%%r13,8),%%xmm0\n"
1046 - "movzb (%2,%%r10,1), %%r13 \n"
1047 - "movq 4096(%5,%%r13,8),%%xmm1\n"
1049 - "paddsw %%xmm1,%%xmm0\n"
1050 - "sar $0x10,%%r11\n"
1052 - "movzb (%0,%%r11,1), %%r13 \n"
1053 - "movq (%5,%%r13,8),%%xmm1\n"
1055 - "paddsw %%xmm0,%%xmm1\n"
1056 - "psraw $0x6,%%xmm1\n"
1057 - "packuswb %%xmm1,%%xmm1\n"
1058 - "movd %%xmm1,0x0(%3)\n"
1062 - : "r"(y_buf), // %0
1065 - "r"(rgb_buf), // %3
1067 - "r" (kCoefficientsRgbY), // %5
1068 - "r"(static_cast<long>(source_dx)) // %6
1069 - : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1073 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1075 -// PIC version is slower because less registers are available, so
1076 -// non-PIC is used on platforms where it is possible.
1078 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1079 - const uint8* u_buf,
1080 - const uint8* v_buf,
1085 - ".global FastConvertYUVToRGB32Row\n"
1086 -"FastConvertYUVToRGB32Row:\n"
1088 - "mov 0x24(%esp),%edx\n"
1089 - "mov 0x28(%esp),%edi\n"
1090 - "mov 0x2c(%esp),%esi\n"
1091 - "mov 0x30(%esp),%ebp\n"
1092 - "mov 0x34(%esp),%ecx\n"
1093 - "jmp convertend\n"
1096 - "movzbl (%edi),%eax\n"
1098 - "movzbl (%esi),%ebx\n"
1100 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1101 - "movzbl (%edx),%eax\n"
1102 - "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
1103 - "movzbl 0x1(%edx),%ebx\n"
1104 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1106 - "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
1107 - "paddsw %mm0,%mm1\n"
1108 - "paddsw %mm0,%mm2\n"
1109 - "psraw $0x6,%mm1\n"
1110 - "psraw $0x6,%mm2\n"
1111 - "packuswb %mm2,%mm1\n"
1112 - "movntq %mm1,0x0(%ebp)\n"
1116 - "jns convertloop\n"
1119 - "je convertdone\n"
1121 - "movzbl (%edi),%eax\n"
1122 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1123 - "movzbl (%esi),%eax\n"
1124 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1125 - "movzbl (%edx),%eax\n"
1126 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1127 - "paddsw %mm0,%mm1\n"
1128 - "psraw $0x6,%mm1\n"
1129 - "packuswb %mm1,%mm1\n"
1130 - "movd %mm1,0x0(%ebp)\n"
1137 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1138 - const uint8* u_buf,
1139 - const uint8* v_buf,
1145 - ".global ScaleYUVToRGB32Row\n"
1146 -"ScaleYUVToRGB32Row:\n"
1148 - "mov 0x24(%esp),%edx\n"
1149 - "mov 0x28(%esp),%edi\n"
1150 - "mov 0x2c(%esp),%esi\n"
1151 - "mov 0x30(%esp),%ebp\n"
1152 - "mov 0x34(%esp),%ecx\n"
1158 - "sar $0x11,%eax\n"
1159 - "movzbl (%edi,%eax,1),%eax\n"
1160 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1162 - "sar $0x11,%eax\n"
1163 - "movzbl (%esi,%eax,1),%eax\n"
1164 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1166 - "add 0x38(%esp),%ebx\n"
1167 - "sar $0x10,%eax\n"
1168 - "movzbl (%edx,%eax,1),%eax\n"
1169 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1171 - "add 0x38(%esp),%ebx\n"
1172 - "sar $0x10,%eax\n"
1173 - "movzbl (%edx,%eax,1),%eax\n"
1174 - "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
1175 - "paddsw %mm0,%mm1\n"
1176 - "paddsw %mm0,%mm2\n"
1177 - "psraw $0x6,%mm1\n"
1178 - "psraw $0x6,%mm2\n"
1179 - "packuswb %mm2,%mm1\n"
1180 - "movntq %mm1,0x0(%ebp)\n"
1190 - "sar $0x11,%eax\n"
1191 - "movzbl (%edi,%eax,1),%eax\n"
1192 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1194 - "sar $0x11,%eax\n"
1195 - "movzbl (%esi,%eax,1),%eax\n"
1196 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1198 - "sar $0x10,%eax\n"
1199 - "movzbl (%edx,%eax,1),%eax\n"
1200 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1201 - "paddsw %mm0,%mm1\n"
1202 - "psraw $0x6,%mm1\n"
1203 - "packuswb %mm1,%mm1\n"
1204 - "movd %mm1,0x0(%ebp)\n"
1211 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1212 - const uint8* u_buf,
1213 - const uint8* v_buf,
1219 - ".global LinearScaleYUVToRGB32Row\n"
1220 -"LinearScaleYUVToRGB32Row:\n"
1222 - "mov 0x24(%esp),%edx\n"
1223 - "mov 0x28(%esp),%edi\n"
1224 - "mov 0x30(%esp),%ebp\n"
1226 - // source_width = width * source_dx + ebx
1227 - "mov 0x34(%esp), %ecx\n"
1228 - "imull 0x38(%esp), %ecx\n"
1229 - "mov %ecx, 0x34(%esp)\n"
1231 - "mov 0x38(%esp), %ecx\n"
1232 - "xor %ebx,%ebx\n" // x = 0
1233 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
1235 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
1236 - "jmp .lscaleend\n"
1240 - "sar $0x11,%eax\n"
1242 - "movzbl (%edi,%eax,1),%ecx\n"
1243 - "movzbl 1(%edi,%eax,1),%esi\n"
1245 - "andl $0x1fffe, %eax \n"
1246 - "imul %eax, %esi \n"
1247 - "xorl $0x1fffe, %eax \n"
1248 - "imul %eax, %ecx \n"
1249 - "addl %esi, %ecx \n"
1250 - "shrl $17, %ecx \n"
1251 - "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
1253 - "mov 0x2c(%esp),%esi\n"
1255 - "sar $0x11,%eax\n"
1257 - "movzbl (%esi,%eax,1),%ecx\n"
1258 - "movzbl 1(%esi,%eax,1),%esi\n"
1260 - "andl $0x1fffe, %eax \n"
1261 - "imul %eax, %esi \n"
1262 - "xorl $0x1fffe, %eax \n"
1263 - "imul %eax, %ecx \n"
1264 - "addl %esi, %ecx \n"
1265 - "shrl $17, %ecx \n"
1266 - "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
1269 - "sar $0x10,%eax\n"
1270 - "movzbl (%edx,%eax,1),%ecx\n"
1271 - "movzbl 1(%edx,%eax,1),%esi\n"
1273 - "add 0x38(%esp),%ebx\n"
1274 - "andl $0xffff, %eax \n"
1275 - "imul %eax, %esi \n"
1276 - "xorl $0xffff, %eax \n"
1277 - "imul %eax, %ecx \n"
1278 - "addl %esi, %ecx \n"
1279 - "shrl $16, %ecx \n"
1280 - "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
1282 - "cmp 0x34(%esp), %ebx\n"
1283 - "jge .lscalelastpixel\n"
1286 - "sar $0x10,%eax\n"
1287 - "movzbl (%edx,%eax,1),%ecx\n"
1288 - "movzbl 1(%edx,%eax,1),%esi\n"
1290 - "add 0x38(%esp),%ebx\n"
1291 - "andl $0xffff, %eax \n"
1292 - "imul %eax, %esi \n"
1293 - "xorl $0xffff, %eax \n"
1294 - "imul %eax, %ecx \n"
1295 - "addl %esi, %ecx \n"
1296 - "shrl $16, %ecx \n"
1297 - "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
1299 - "paddsw %mm0,%mm1\n"
1300 - "paddsw %mm0,%mm2\n"
1301 - "psraw $0x6,%mm1\n"
1302 - "psraw $0x6,%mm2\n"
1303 - "packuswb %mm2,%mm1\n"
1304 - "movntq %mm1,0x0(%ebp)\n"
1308 - "cmp 0x34(%esp), %ebx\n"
1309 - "jl .lscaleloop\n"
1313 -".lscalelastpixel:"
1314 - "paddsw %mm0, %mm1\n"
1315 - "psraw $6, %mm1\n"
1316 - "packuswb %mm1, %mm1\n"
1317 - "movd %mm1, (%ebp)\n"
1322 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
1324 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
1325 - const uint8* u_buf,
1326 - const uint8* v_buf,
1329 - int16 *kCoefficientsRgbY);
1332 -#if defined(OS_MACOSX)
1333 -"_PICConvertYUVToRGB32Row:\n"
1335 -"PICConvertYUVToRGB32Row:\n"
1338 - "mov 0x24(%esp),%edx\n"
1339 - "mov 0x28(%esp),%edi\n"
1340 - "mov 0x2c(%esp),%esi\n"
1341 - "mov 0x30(%esp),%ebp\n"
1342 - "mov 0x38(%esp),%ecx\n"
1344 - "jmp .Lconvertend\n"
1347 - "movzbl (%edi),%eax\n"
1349 - "movzbl (%esi),%ebx\n"
1351 - "movq 2048(%ecx,%eax,8),%mm0\n"
1352 - "movzbl (%edx),%eax\n"
1353 - "paddsw 4096(%ecx,%ebx,8),%mm0\n"
1354 - "movzbl 0x1(%edx),%ebx\n"
1355 - "movq 0(%ecx,%eax,8),%mm1\n"
1357 - "movq 0(%ecx,%ebx,8),%mm2\n"
1358 - "paddsw %mm0,%mm1\n"
1359 - "paddsw %mm0,%mm2\n"
1360 - "psraw $0x6,%mm1\n"
1361 - "psraw $0x6,%mm2\n"
1362 - "packuswb %mm2,%mm1\n"
1363 - "movntq %mm1,0x0(%ebp)\n"
1366 - "subl $0x2,0x34(%esp)\n"
1367 - "jns .Lconvertloop\n"
1369 - "andl $0x1,0x34(%esp)\n"
1370 - "je .Lconvertdone\n"
1372 - "movzbl (%edi),%eax\n"
1373 - "movq 2048(%ecx,%eax,8),%mm0\n"
1374 - "movzbl (%esi),%eax\n"
1375 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1376 - "movzbl (%edx),%eax\n"
1377 - "movq 0(%ecx,%eax,8),%mm1\n"
1378 - "paddsw %mm0,%mm1\n"
1379 - "psraw $0x6,%mm1\n"
1380 - "packuswb %mm1,%mm1\n"
1381 - "movd %mm1,0x0(%ebp)\n"
1387 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1388 - const uint8* u_buf,
1389 - const uint8* v_buf,
1392 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
1393 - &kCoefficientsRgbY[0][0]);
1396 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
1397 - const uint8* u_buf,
1398 - const uint8* v_buf,
1402 - int16 *kCoefficientsRgbY);
1406 -#if defined(OS_MACOSX)
1407 -"_PICScaleYUVToRGB32Row:\n"
1409 -"PICScaleYUVToRGB32Row:\n"
1412 - "mov 0x24(%esp),%edx\n"
1413 - "mov 0x28(%esp),%edi\n"
1414 - "mov 0x2c(%esp),%esi\n"
1415 - "mov 0x30(%esp),%ebp\n"
1416 - "mov 0x3c(%esp),%ecx\n"
1422 - "sar $0x11,%eax\n"
1423 - "movzbl (%edi,%eax,1),%eax\n"
1424 - "movq 2048(%ecx,%eax,8),%mm0\n"
1426 - "sar $0x11,%eax\n"
1427 - "movzbl (%esi,%eax,1),%eax\n"
1428 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1430 - "add 0x38(%esp),%ebx\n"
1431 - "sar $0x10,%eax\n"
1432 - "movzbl (%edx,%eax,1),%eax\n"
1433 - "movq 0(%ecx,%eax,8),%mm1\n"
1435 - "add 0x38(%esp),%ebx\n"
1436 - "sar $0x10,%eax\n"
1437 - "movzbl (%edx,%eax,1),%eax\n"
1438 - "movq 0(%ecx,%eax,8),%mm2\n"
1439 - "paddsw %mm0,%mm1\n"
1440 - "paddsw %mm0,%mm2\n"
1441 - "psraw $0x6,%mm1\n"
1442 - "psraw $0x6,%mm2\n"
1443 - "packuswb %mm2,%mm1\n"
1444 - "movntq %mm1,0x0(%ebp)\n"
1447 - "subl $0x2,0x34(%esp)\n"
1448 - "jns Lscaleloop\n"
1450 - "andl $0x1,0x34(%esp)\n"
1454 - "sar $0x11,%eax\n"
1455 - "movzbl (%edi,%eax,1),%eax\n"
1456 - "movq 2048(%ecx,%eax,8),%mm0\n"
1458 - "sar $0x11,%eax\n"
1459 - "movzbl (%esi,%eax,1),%eax\n"
1460 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1462 - "sar $0x10,%eax\n"
1463 - "movzbl (%edx,%eax,1),%eax\n"
1464 - "movq 0(%ecx,%eax,8),%mm1\n"
1465 - "paddsw %mm0,%mm1\n"
1466 - "psraw $0x6,%mm1\n"
1467 - "packuswb %mm1,%mm1\n"
1468 - "movd %mm1,0x0(%ebp)\n"
1476 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1477 - const uint8* u_buf,
1478 - const uint8* v_buf,
1482 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1483 - &kCoefficientsRgbY[0][0]);
1486 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
1487 - const uint8* u_buf,
1488 - const uint8* v_buf,
1492 - int16 *kCoefficientsRgbY);
1495 -#if defined(OS_MACOSX)
1496 -"_PICLinearScaleYUVToRGB32Row:\n"
1498 -"PICLinearScaleYUVToRGB32Row:\n"
1501 - "mov 0x24(%esp),%edx\n"
1502 - "mov 0x30(%esp),%ebp\n"
1503 - "mov 0x34(%esp),%ecx\n"
1504 - "mov 0x3c(%esp),%edi\n"
1507 - // source_width = width * source_dx + ebx
1508 - "mov 0x34(%esp), %ecx\n"
1509 - "imull 0x38(%esp), %ecx\n"
1510 - "mov %ecx, 0x34(%esp)\n"
1512 - "mov 0x38(%esp), %ecx\n"
1513 - "xor %ebx,%ebx\n" // x = 0
1514 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
1516 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
1517 - "jmp .lscaleend\n"
1520 - "mov 0x28(%esp),%esi\n"
1522 - "sar $0x11,%eax\n"
1524 - "movzbl (%esi,%eax,1),%ecx\n"
1525 - "movzbl 1(%esi,%eax,1),%esi\n"
1527 - "andl $0x1fffe, %eax \n"
1528 - "imul %eax, %esi \n"
1529 - "xorl $0x1fffe, %eax \n"
1530 - "imul %eax, %ecx \n"
1531 - "addl %esi, %ecx \n"
1532 - "shrl $17, %ecx \n"
1533 - "movq 2048(%edi,%ecx,8),%mm0\n"
1535 - "mov 0x2c(%esp),%esi\n"
1537 - "sar $0x11,%eax\n"
1539 - "movzbl (%esi,%eax,1),%ecx\n"
1540 - "movzbl 1(%esi,%eax,1),%esi\n"
1542 - "andl $0x1fffe, %eax \n"
1543 - "imul %eax, %esi \n"
1544 - "xorl $0x1fffe, %eax \n"
1545 - "imul %eax, %ecx \n"
1546 - "addl %esi, %ecx \n"
1547 - "shrl $17, %ecx \n"
1548 - "paddsw 4096(%edi,%ecx,8),%mm0\n"
1551 - "sar $0x10,%eax\n"
1552 - "movzbl (%edx,%eax,1),%ecx\n"
1553 - "movzbl 1(%edx,%eax,1),%esi\n"
1555 - "add 0x38(%esp),%ebx\n"
1556 - "andl $0xffff, %eax \n"
1557 - "imul %eax, %esi \n"
1558 - "xorl $0xffff, %eax \n"
1559 - "imul %eax, %ecx \n"
1560 - "addl %esi, %ecx \n"
1561 - "shrl $16, %ecx \n"
1562 - "movq (%edi,%ecx,8),%mm1\n"
1564 - "cmp 0x34(%esp), %ebx\n"
1565 - "jge .lscalelastpixel\n"
1568 - "sar $0x10,%eax\n"
1569 - "movzbl (%edx,%eax,1),%ecx\n"
1570 - "movzbl 1(%edx,%eax,1),%esi\n"
1572 - "add 0x38(%esp),%ebx\n"
1573 - "andl $0xffff, %eax \n"
1574 - "imul %eax, %esi \n"
1575 - "xorl $0xffff, %eax \n"
1576 - "imul %eax, %ecx \n"
1577 - "addl %esi, %ecx \n"
1578 - "shrl $16, %ecx \n"
1579 - "movq (%edi,%ecx,8),%mm2\n"
1581 - "paddsw %mm0,%mm1\n"
1582 - "paddsw %mm0,%mm2\n"
1583 - "psraw $0x6,%mm1\n"
1584 - "psraw $0x6,%mm2\n"
1585 - "packuswb %mm2,%mm1\n"
1586 - "movntq %mm1,0x0(%ebp)\n"
1590 - "cmp %ebx, 0x34(%esp)\n"
1591 - "jg .lscaleloop\n"
1595 -".lscalelastpixel:"
1596 - "paddsw %mm0, %mm1\n"
1597 - "psraw $6, %mm1\n"
1598 - "packuswb %mm1, %mm1\n"
1599 - "movd %mm1, (%ebp)\n"
1604 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1605 - const uint8* u_buf,
1606 - const uint8* v_buf,
1610 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1611 - &kCoefficientsRgbY[0][0]);
1616 // C reference code that mimic the YUV assembly.
1617 #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
1618 #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
1619 (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
1621 static inline void YuvPixel(uint8 y,
1624 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
1627 *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
1628 (packuswb(g) << 8) |
1629 (packuswb(r) << 16) |
1630 (packuswb(a) << 24);
1633 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1634 - const uint8* u_buf,
1635 - const uint8* v_buf,
1638 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
1639 + const uint8* u_buf,
1640 + const uint8* v_buf,
1643 + unsigned int x_shift) {
1644 for (int x = 0; x < width; x += 2) {
1645 - uint8 u = u_buf[x >> 1];
1646 - uint8 v = v_buf[x >> 1];
1647 + uint8 u = u_buf[x >> x_shift];
1648 + uint8 v = v_buf[x >> x_shift];
1649 uint8 y0 = y_buf[x];
1650 YuvPixel(y0, u, v, rgb_buf);
1651 if ((x + 1) < width) {
1652 uint8 y1 = y_buf[x + 1];
1653 + if (x_shift == 0) {
1657 YuvPixel(y1, u, v, rgb_buf + 4);
1659 rgb_buf += 8; // Advance 2 pixels.
1663 // 16.16 fixed point is used. A shift by 16 isolates the integer.
1664 // A shift by 17 is used to further subsample the chrominence channels.
1665 // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
1666 // for 1/65536 pixel accurate interpolation.
1667 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1668 - const uint8* u_buf,
1669 - const uint8* v_buf,
1673 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
1674 + const uint8* u_buf,
1675 + const uint8* v_buf,
1680 for (int i = 0; i < width; i += 2) {
1681 int y = y_buf[x >> 16];
1682 int u = u_buf[(x >> 17)];
1683 int v = v_buf[(x >> 17)];
1684 YuvPixel(y, u, v, rgb_buf);
1686 if ((i + 1) < width) {
1688 YuvPixel(y, u, v, rgb_buf+4);
1695 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1696 - const uint8* u_buf,
1697 - const uint8* v_buf,
1701 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
1702 + const uint8* u_buf,
1703 + const uint8* v_buf,
1708 if (source_dx >= 0x20000) {
1711 for (int i = 0; i < width; i += 2) {
1712 int y0 = y_buf[x >> 16];
1713 int y1 = y_buf[(x >> 16) + 1];
1714 int u0 = u_buf[(x >> 17)];
1715 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
1716 y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
1717 YuvPixel(y, u, v, rgb_buf+4);
1727 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
1728 --- a/gfx/ycbcr/yuv_row_posix.cpp
1729 +++ b/gfx/ycbcr/yuv_row_posix.cpp
1731 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
1732 // Use of this source code is governed by a BSD-style license that can be
1733 // found in the LICENSE file.
1735 -#include "media/base/yuv_row.h"
1738 -#include "base/logging.h"
1740 +#include "yuv_row.h"
1741 +#include "mozilla/SSE.h"
1748 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
1749 +#if defined(ARCH_CPU_X86_64)
1751 +// We don't need CPUID guards here, since x86-64 implies SSE2.
1753 // AMD64 ABI uses register paremters.
1754 void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
1755 const uint8* u_buf, // rsi
1756 const uint8* v_buf, // rdx
1757 uint8* rgb_buf, // rcx
1760 - "jmp convertend\n"
1764 "movzb (%1),%%r10\n"
1766 "movzb (%2),%%r11\n"
1768 "movq 2048(%5,%%r10,8),%%xmm0\n"
1769 "movzb (%0),%%r10\n"
1770 "movq 4096(%5,%%r11,8),%%xmm1\n"
1771 "movzb 0x1(%0),%%r11\n"
1772 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
1773 "movq (%5,%%r11,8),%%xmm3\n"
1774 "paddsw %%xmm0,%%xmm2\n"
1775 "paddsw %%xmm0,%%xmm3\n"
1776 "shufps $0x44,%%xmm3,%%xmm2\n"
1777 "psraw $0x6,%%xmm2\n"
1778 "packuswb %%xmm2,%%xmm2\n"
1779 "movq %%xmm2,0x0(%3)\n"
1784 - "jns convertloop\n"
1791 - "js convertdone\n"
1794 "movzb (%1),%%r10\n"
1795 "movq 2048(%5,%%r10,8),%%xmm0\n"
1796 "movzb (%2),%%r10\n"
1797 "movq 4096(%5,%%r10,8),%%xmm1\n"
1798 "paddsw %%xmm1,%%xmm0\n"
1799 "movzb (%0),%%r10\n"
1800 "movq (%5,%%r10,8),%%xmm1\n"
1801 "paddsw %%xmm0,%%xmm1\n"
1802 "psraw $0x6,%%xmm1\n"
1803 "packuswb %%xmm1,%%xmm1\n"
1804 "movd %%xmm1,0x0(%3)\n"
1813 "r" (kCoefficientsRgbY) // %5
1814 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
1815 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1816 const uint8* u_buf, // rsi
1817 const uint8* v_buf, // rdx
1818 uint8* rgb_buf, // rcx
1820 int source_dx) { // r9
1832 "movzb (%1,%%r10,1),%%rax\n"
1833 "movq 2048(%5,%%rax,8),%%xmm0\n"
1834 "movzb (%2,%%r10,1),%%rax\n"
1835 "movq 4096(%5,%%rax,8),%%xmm1\n"
1836 "lea (%%r11,%6),%%r10\n"
1838 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1839 "paddsw %%xmm0,%%xmm1\n"
1840 "paddsw %%xmm0,%%xmm2\n"
1841 "shufps $0x44,%%xmm2,%%xmm1\n"
1842 "psraw $0x6,%%xmm1\n"
1843 "packuswb %%xmm1,%%xmm1\n"
1844 "movq %%xmm1,0x0(%3)\n"
1859 "movzb (%1,%%r10,1),%%rax\n"
1860 "movq 2048(%5,%%rax,8),%%xmm0\n"
1861 "movzb (%2,%%r10,1),%%rax\n"
1862 "movq 4096(%5,%%rax,8),%%xmm1\n"
1863 "paddsw %%xmm1,%%xmm0\n"
1865 "movzb (%0,%%r11,1),%%rax\n"
1866 "movq (%5,%%rax,8),%%xmm1\n"
1867 "paddsw %%xmm0,%%xmm1\n"
1868 "psraw $0x6,%%xmm1\n"
1869 "packuswb %%xmm1,%%xmm1\n"
1870 "movd %%xmm1,0x0(%3)\n"
1880 "r" (kCoefficientsRgbY), // %5
1881 "r"(static_cast<long>(source_dx)) // %6
1882 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
1889 "xor %%r11,%%r11\n" // x = 0
1891 - "js .lscalenext\n"
1893 "cmp $0x20000,%6\n" // if source_dx >= 2.0
1894 - "jl .lscalehalf\n"
1896 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1906 "movzb (%1, %%r10, 1), %%r13 \n"
1907 "movzb 1(%1, %%r10, 1), %%r14 \n"
1908 "mov %%r11, %%rax \n"
1909 "and $0x1fffe, %%rax \n"
1910 "imul %%rax, %%r14 \n"
1911 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
1912 "paddsw %%xmm0,%%xmm1\n"
1913 "paddsw %%xmm0,%%xmm2\n"
1914 "shufps $0x44,%%xmm2,%%xmm1\n"
1915 "psraw $0x6,%%xmm1\n"
1916 "packuswb %%xmm1,%%xmm1\n"
1917 "movq %%xmm1,0x0(%3)\n"
1920 - "jns .lscaleloop\n"
1927 - "js .lscaledone\n"
1933 "movzb (%1,%%r10,1), %%r13 \n"
1934 "movq 2048(%5,%%r13,8),%%xmm0\n"
1936 "movzb (%2,%%r10,1), %%r13 \n"
1937 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
1938 "movzb (%0,%%r11,1), %%r13 \n"
1939 "movq (%5,%%r13,8),%%xmm1\n"
1941 "paddsw %%xmm0,%%xmm1\n"
1942 "psraw $0x6,%%xmm1\n"
1943 "packuswb %%xmm1,%%xmm1\n"
1944 "movd %%xmm1,0x0(%3)\n"
1954 "r" (kCoefficientsRgbY), // %5
1955 "r"(static_cast<long>(source_dx)) // %6
1956 : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1960 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1961 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
1963 // PIC version is slower because less registers are available, so
1964 // non-PIC is used on platforms where it is possible.
1966 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1967 - const uint8* u_buf,
1968 - const uint8* v_buf,
1971 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
1972 + const uint8* u_buf,
1973 + const uint8* v_buf,
1978 - ".global FastConvertYUVToRGB32Row\n"
1979 -"FastConvertYUVToRGB32Row:\n"
1980 + ".global FastConvertYUVToRGB32Row_SSE\n"
1981 + ".type FastConvertYUVToRGB32Row_SSE, @function\n"
1982 +"FastConvertYUVToRGB32Row_SSE:\n"
1984 "mov 0x24(%esp),%edx\n"
1985 "mov 0x28(%esp),%edi\n"
1986 "mov 0x2c(%esp),%esi\n"
1987 "mov 0x30(%esp),%ebp\n"
1988 "mov 0x34(%esp),%ecx\n"
1989 - "jmp convertend\n"
1995 "movzbl (%edi),%eax\n"
1997 "movzbl (%esi),%ebx\n"
1999 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2000 "movzbl (%edx),%eax\n"
2001 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
2002 "movzbl 0x1(%edx),%ebx\n"
2003 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
2004 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
2005 "paddsw %mm0,%mm1\n"
2006 "paddsw %mm0,%mm2\n"
2009 "packuswb %mm2,%mm1\n"
2010 "movntq %mm1,0x0(%ebp)\n"
2015 - "jns convertloop\n"
2019 - "je convertdone\n"
2022 "movzbl (%edi),%eax\n"
2023 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2024 "movzbl (%esi),%eax\n"
2025 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2026 "movzbl (%edx),%eax\n"
2027 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
2028 "paddsw %mm0,%mm1\n"
2030 "packuswb %mm1,%mm1\n"
2031 "movd %mm1,0x0(%ebp)\n"
2036 +#if !defined(XP_MACOSX)
2042 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2043 - const uint8* u_buf,
2044 - const uint8* v_buf,
2048 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
2049 + const uint8* u_buf,
2050 + const uint8* v_buf,
2054 + if (mozilla::supports_sse()) {
2055 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
2059 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2063 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2064 + const uint8* u_buf,
2065 + const uint8* v_buf,
2071 - ".global ScaleYUVToRGB32Row\n"
2072 -"ScaleYUVToRGB32Row:\n"
2073 + ".global ScaleYUVToRGB32Row_SSE\n"
2074 + ".type ScaleYUVToRGB32Row_SSE, @function\n"
2075 +"ScaleYUVToRGB32Row_SSE:\n"
2077 "mov 0x24(%esp),%edx\n"
2078 "mov 0x28(%esp),%edi\n"
2079 "mov 0x2c(%esp),%esi\n"
2080 "mov 0x30(%esp),%ebp\n"
2081 "mov 0x34(%esp),%ecx\n"
2091 "movzbl (%edi,%eax,1),%eax\n"
2092 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2095 "movzbl (%esi,%eax,1),%eax\n"
2096 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2097 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2098 "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
2099 "paddsw %mm0,%mm1\n"
2100 "paddsw %mm0,%mm2\n"
2103 "packuswb %mm2,%mm1\n"
2104 "movntq %mm1,0x0(%ebp)\n"
2118 "movzbl (%edi,%eax,1),%eax\n"
2119 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2122 "movzbl (%esi,%eax,1),%eax\n"
2123 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2125 "movzbl (%edx,%eax,1),%eax\n"
2126 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
2127 "paddsw %mm0,%mm1\n"
2129 "packuswb %mm1,%mm1\n"
2130 "movd %mm1,0x0(%ebp)\n"
2136 +#if !defined(XP_MACOSX)
2141 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2142 - const uint8* u_buf,
2143 - const uint8* v_buf,
2147 +void ScaleYUVToRGB32Row(const uint8* y_buf,
2148 + const uint8* u_buf,
2149 + const uint8* v_buf,
2154 + if (mozilla::supports_sse()) {
2155 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2156 + width, source_dx);
2159 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
2160 + width, source_dx);
2163 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2164 + const uint8* u_buf,
2165 + const uint8* v_buf,
2171 - ".global LinearScaleYUVToRGB32Row\n"
2172 -"LinearScaleYUVToRGB32Row:\n"
2173 + ".global LinearScaleYUVToRGB32Row_SSE\n"
2174 + ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
2175 +"LinearScaleYUVToRGB32Row_SSE:\n"
2177 "mov 0x24(%esp),%edx\n"
2178 "mov 0x28(%esp),%edi\n"
2179 "mov 0x30(%esp),%ebp\n"
2181 // source_width = width * source_dx + ebx
2182 "mov 0x34(%esp), %ecx\n"
2183 "imull 0x38(%esp), %ecx\n"
2184 "mov %ecx, 0x34(%esp)\n"
2186 "mov 0x38(%esp), %ecx\n"
2187 "xor %ebx,%ebx\n" // x = 0
2188 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
2191 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
2192 - "jmp .lscaleend\n"
2196 - "sar $0x11,%eax\n"
2201 + "sar $0x11,%eax\n"
2203 "movzbl (%edi,%eax,1),%ecx\n"
2204 "movzbl 1(%edi,%eax,1),%esi\n"
2206 "andl $0x1fffe, %eax \n"
2207 "imul %eax, %esi \n"
2208 "xorl $0x1fffe, %eax \n"
2209 "imul %eax, %ecx \n"
2210 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
2211 "imul %eax, %esi \n"
2212 "xorl $0xffff, %eax \n"
2213 "imul %eax, %ecx \n"
2214 "addl %esi, %ecx \n"
2216 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
2218 "cmp 0x34(%esp), %ebx\n"
2219 - "jge .lscalelastpixel\n"
2224 "movzbl (%edx,%eax,1),%ecx\n"
2225 "movzbl 1(%edx,%eax,1),%esi\n"
2227 "add 0x38(%esp),%ebx\n"
2228 "andl $0xffff, %eax \n"
2229 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
2230 "paddsw %mm0,%mm1\n"
2231 "paddsw %mm0,%mm2\n"
2234 "packuswb %mm2,%mm1\n"
2235 "movntq %mm1,0x0(%ebp)\n"
2240 "cmp 0x34(%esp), %ebx\n"
2241 - "jl .lscaleloop\n"
2246 -".lscalelastpixel:"
2248 "paddsw %mm0, %mm1\n"
2250 "packuswb %mm1, %mm1\n"
2251 "movd %mm1, (%ebp)\n"
2254 +#if !defined(XP_MACOSX)
2259 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
2261 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
2262 - const uint8* u_buf,
2263 - const uint8* v_buf,
2266 - int16 *kCoefficientsRgbY);
2267 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2268 + const uint8* u_buf,
2269 + const uint8* v_buf,
2274 + if (mozilla::supports_sse()) {
2275 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2276 + width, source_dx);
2279 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
2280 + width, source_dx);
2283 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
2285 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2286 + const uint8* u_buf,
2287 + const uint8* v_buf,
2290 + int16 *kCoefficientsRgbY);
2294 -#if defined(OS_MACOSX)
2295 -"_PICConvertYUVToRGB32Row:\n"
2296 +#if defined(XP_MACOSX)
2297 +"_PICConvertYUVToRGB32Row_SSE:\n"
2299 -"PICConvertYUVToRGB32Row:\n"
2300 +"PICConvertYUVToRGB32Row_SSE:\n"
2303 "mov 0x24(%esp),%edx\n"
2304 "mov 0x28(%esp),%edi\n"
2305 "mov 0x2c(%esp),%esi\n"
2306 "mov 0x30(%esp),%ebp\n"
2307 "mov 0x38(%esp),%ecx\n"
2309 - "jmp .Lconvertend\n"
2315 "movzbl (%edi),%eax\n"
2317 "movzbl (%esi),%ebx\n"
2319 "movq 2048(%ecx,%eax,8),%mm0\n"
2320 "movzbl (%edx),%eax\n"
2321 "paddsw 4096(%ecx,%ebx,8),%mm0\n"
2322 "movzbl 0x1(%edx),%ebx\n"
2323 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
2324 "movq 0(%ecx,%ebx,8),%mm2\n"
2325 "paddsw %mm0,%mm1\n"
2326 "paddsw %mm0,%mm2\n"
2329 "packuswb %mm2,%mm1\n"
2330 "movntq %mm1,0x0(%ebp)\n"
2334 "subl $0x2,0x34(%esp)\n"
2335 - "jns .Lconvertloop\n"
2338 "andl $0x1,0x34(%esp)\n"
2339 - "je .Lconvertdone\n"
2342 "movzbl (%edi),%eax\n"
2343 "movq 2048(%ecx,%eax,8),%mm0\n"
2344 "movzbl (%esi),%eax\n"
2345 "paddsw 4096(%ecx,%eax,8),%mm0\n"
2346 "movzbl (%edx),%eax\n"
2347 "movq 0(%ecx,%eax,8),%mm1\n"
2348 "paddsw %mm0,%mm1\n"
2350 "packuswb %mm1,%mm1\n"
2351 "movd %mm1,0x0(%ebp)\n"
2356 +#if !defined(XP_MACOSX)
2361 void FastConvertYUVToRGB32Row(const uint8* y_buf,
2366 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
2367 - &kCoefficientsRgbY[0][0]);
2370 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
2373 + if (mozilla::supports_sse()) {
2374 + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2375 + &kCoefficientsRgbY[0][0]);
2379 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2382 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2388 int16 *kCoefficientsRgbY);
2392 -#if defined(OS_MACOSX)
2393 -"_PICScaleYUVToRGB32Row:\n"
2394 +#if defined(XP_MACOSX)
2395 +"_PICScaleYUVToRGB32Row_SSE:\n"
2397 -"PICScaleYUVToRGB32Row:\n"
2398 +"PICScaleYUVToRGB32Row_SSE:\n"
2401 "mov 0x24(%esp),%edx\n"
2402 "mov 0x28(%esp),%edi\n"
2403 "mov 0x2c(%esp),%esi\n"
2404 "mov 0x30(%esp),%ebp\n"
2405 "mov 0x3c(%esp),%ecx\n"
2415 "movzbl (%edi,%eax,1),%eax\n"
2416 "movq 2048(%ecx,%eax,8),%mm0\n"
2419 "movzbl (%esi,%eax,1),%eax\n"
2420 "paddsw 4096(%ecx,%eax,8),%mm0\n"
2421 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
2422 "movq 0(%ecx,%eax,8),%mm2\n"
2423 "paddsw %mm0,%mm1\n"
2424 "paddsw %mm0,%mm2\n"
2427 "packuswb %mm2,%mm1\n"
2428 "movntq %mm1,0x0(%ebp)\n"
2432 "subl $0x2,0x34(%esp)\n"
2433 - "jns Lscaleloop\n"
2436 "andl $0x1,0x34(%esp)\n"
2442 "movzbl (%edi,%eax,1),%eax\n"
2443 "movq 2048(%ecx,%eax,8),%mm0\n"
2446 "movzbl (%esi,%eax,1),%eax\n"
2447 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
2449 "movzbl (%edx,%eax,1),%eax\n"
2450 "movq 0(%ecx,%eax,8),%mm1\n"
2451 "paddsw %mm0,%mm1\n"
2453 "packuswb %mm1,%mm1\n"
2454 "movd %mm1,0x0(%ebp)\n"
2460 +#if !defined(XP_MACOSX)
2466 void ScaleYUVToRGB32Row(const uint8* y_buf,
2472 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2473 - &kCoefficientsRgbY[0][0]);
2476 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
2477 - const uint8* u_buf,
2478 - const uint8* v_buf,
2482 - int16 *kCoefficientsRgbY);
2485 + if (mozilla::supports_sse()) {
2486 + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2487 + &kCoefficientsRgbY[0][0]);
2491 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2494 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2495 + const uint8* u_buf,
2496 + const uint8* v_buf,
2500 + int16 *kCoefficientsRgbY);
2504 -#if defined(OS_MACOSX)
2505 -"_PICLinearScaleYUVToRGB32Row:\n"
2506 +#if defined(XP_MACOSX)
2507 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"
2509 -"PICLinearScaleYUVToRGB32Row:\n"
2510 +"PICLinearScaleYUVToRGB32Row_SSE:\n"
2513 "mov 0x24(%esp),%edx\n"
2514 "mov 0x30(%esp),%ebp\n"
2515 "mov 0x34(%esp),%ecx\n"
2516 "mov 0x3c(%esp),%edi\n"
2519 // source_width = width * source_dx + ebx
2520 "mov 0x34(%esp), %ecx\n"
2521 "imull 0x38(%esp), %ecx\n"
2522 "mov %ecx, 0x34(%esp)\n"
2524 "mov 0x38(%esp), %ecx\n"
2525 "xor %ebx,%ebx\n" // x = 0
2526 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
2529 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
2530 - "jmp .lscaleend\n"
2536 "mov 0x28(%esp),%esi\n"
2540 "movzbl (%esi,%eax,1),%ecx\n"
2541 "movzbl 1(%esi,%eax,1),%esi\n"
2543 "andl $0x1fffe, %eax \n"
2544 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
2545 "imul %eax, %esi \n"
2546 "xorl $0xffff, %eax \n"
2547 "imul %eax, %ecx \n"
2548 "addl %esi, %ecx \n"
2550 "movq (%edi,%ecx,8),%mm1\n"
2552 "cmp 0x34(%esp), %ebx\n"
2553 - "jge .lscalelastpixel\n"
2558 "movzbl (%edx,%eax,1),%ecx\n"
2559 "movzbl 1(%edx,%eax,1),%esi\n"
2561 "add 0x38(%esp),%ebx\n"
2562 "andl $0xffff, %eax \n"
2563 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
2564 "paddsw %mm0,%mm1\n"
2565 "paddsw %mm0,%mm2\n"
2568 "packuswb %mm2,%mm1\n"
2569 "movntq %mm1,0x0(%ebp)\n"
2574 "cmp %ebx, 0x34(%esp)\n"
2575 - "jg .lscaleloop\n"
2580 -".lscalelastpixel:"
2582 "paddsw %mm0, %mm1\n"
2584 "packuswb %mm1, %mm1\n"
2585 "movd %mm1, (%ebp)\n"
2588 +#if !defined(XP_MACOSX)
2594 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2595 - const uint8* u_buf,
2596 - const uint8* v_buf,
2600 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2601 - &kCoefficientsRgbY[0][0]);
2606 -// C reference code that mimic the YUV assembly.
2607 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2608 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2609 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2611 -static inline void YuvPixel(uint8 y,
2616 - int b = kCoefficientsRgbY[256+u][0];
2617 - int g = kCoefficientsRgbY[256+u][1];
2618 - int r = kCoefficientsRgbY[256+u][2];
2619 - int a = kCoefficientsRgbY[256+u][3];
2621 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2622 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2623 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2624 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2626 - b = paddsw(b, kCoefficientsRgbY[y][0]);
2627 - g = paddsw(g, kCoefficientsRgbY[y][1]);
2628 - r = paddsw(r, kCoefficientsRgbY[y][2]);
2629 - a = paddsw(a, kCoefficientsRgbY[y][3]);
2636 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
2637 - (packuswb(g) << 8) |
2638 - (packuswb(r) << 16) |
2639 - (packuswb(a) << 24);
2642 + const uint8* u_buf,
2643 + const uint8* v_buf,
2648 + if (mozilla::supports_sse()) {
2649 + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2650 + source_dx, &kCoefficientsRgbY[0][0]);
2654 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2657 void FastConvertYUVToRGB32Row(const uint8* y_buf,
2662 - for (int x = 0; x < width; x += 2) {
2663 - uint8 u = u_buf[x >> 1];
2664 - uint8 v = v_buf[x >> 1];
2665 - uint8 y0 = y_buf[x];
2666 - YuvPixel(y0, u, v, rgb_buf);
2667 - if ((x + 1) < width) {
2668 - uint8 y1 = y_buf[x + 1];
2669 - YuvPixel(y1, u, v, rgb_buf + 4);
2671 - rgb_buf += 8; // Advance 2 pixels.
2675 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
2676 -// A shift by 17 is used to further subsample the chrominence channels.
2677 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
2678 -// for 1/65536 pixel accurate interpolation.
2679 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2682 void ScaleYUVToRGB32Row(const uint8* y_buf,
2689 - for (int i = 0; i < width; i += 2) {
2690 - int y = y_buf[x >> 16];
2691 - int u = u_buf[(x >> 17)];
2692 - int v = v_buf[(x >> 17)];
2693 - YuvPixel(y, u, v, rgb_buf);
2695 - if ((i + 1) < width) {
2696 - y = y_buf[x >> 16];
2697 - YuvPixel(y, u, v, rgb_buf+4);
2703 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2706 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2713 - if (source_dx >= 0x20000) {
2716 - for (int i = 0; i < width; i += 2) {
2717 - int y0 = y_buf[x >> 16];
2718 - int y1 = y_buf[(x >> 16) + 1];
2719 - int u0 = u_buf[(x >> 17)];
2720 - int u1 = u_buf[(x >> 17) + 1];
2721 - int v0 = v_buf[(x >> 17)];
2722 - int v1 = v_buf[(x >> 17) + 1];
2723 - int y_frac = (x & 65535);
2724 - int uv_frac = ((x >> 1) & 65535);
2725 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2726 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
2727 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
2728 - YuvPixel(y, u, v, rgb_buf);
2730 - if ((i + 1) < width) {
2731 - y0 = y_buf[x >> 16];
2732 - y1 = y_buf[(x >> 16) + 1];
2733 - y_frac = (x & 65535);
2734 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2735 - YuvPixel(y, u, v, rgb_buf+4);
2745 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2750 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
2751 --- a/gfx/ycbcr/yuv_row_table.cpp
2752 +++ b/gfx/ycbcr/yuv_row_table.cpp
2754 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2755 // Use of this source code is governed by a BSD-style license that can be
2756 // found in the LICENSE file.
2758 -#include "media/base/yuv_row.h"
2759 +#include "yuv_row.h"
2764 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2765 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2766 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2768 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
2769 --- a/gfx/ycbcr/yuv_row_win.cpp
2770 +++ b/gfx/ycbcr/yuv_row_win.cpp
2772 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2773 // Use of this source code is governed by a BSD-style license that can be
2774 // found in the LICENSE file.
2776 -#include "media/base/yuv_row.h"
2777 +#include "yuv_row.h"
2778 +#include "mozilla/SSE.h"
2780 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
2781 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
2787 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2788 - const uint8* u_buf,
2789 - const uint8* v_buf,
2792 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
2794 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2795 + const uint8* u_buf,
2796 + const uint8* v_buf,
2801 mov edx, [esp + 32 + 4] // Y
2802 mov edi, [esp + 32 + 8] // U
2803 mov esi, [esp + 32 + 12] // V
2804 mov ebp, [esp + 32 + 16] // rgb
2805 mov ecx, [esp + 32 + 20] // width
2807 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
2816 -void ConvertYUVToRGB32Row(const uint8* y_buf,
2817 - const uint8* u_buf,
2818 - const uint8* v_buf,
2822 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2823 + const uint8* u_buf,
2824 + const uint8* v_buf,
2830 mov edx, [esp + 32 + 4] // Y
2831 mov edi, [esp + 32 + 8] // U
2832 mov esi, [esp + 32 + 12] // V
2833 mov ebp, [esp + 32 + 16] // rgb
2834 mov ecx, [esp + 32 + 20] // width
2835 mov ebx, [esp + 32 + 24] // step
2836 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
2845 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
2846 - const uint8* u_buf,
2847 - const uint8* v_buf,
2852 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2853 + const uint8* u_buf,
2854 + const uint8* v_buf,
2861 mov edx, [esp + 32 + 4] // Y
2862 mov edi, [esp + 32 + 8] // U
2863 mov esi, [esp + 32 + 12] // V
2864 mov ebp, [esp + 32 + 16] // rgb
2865 mov ecx, [esp + 32 + 20] // width
2867 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
2876 -void DoubleYUVToRGB32Row(const uint8* y_buf,
2877 - const uint8* u_buf,
2878 - const uint8* v_buf,
2881 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
2882 + const uint8* u_buf,
2883 + const uint8* v_buf,
2888 mov edx, [esp + 32 + 4] // Y
2889 mov edi, [esp + 32 + 8] // U
2890 mov esi, [esp + 32 + 12] // V
2891 mov ebp, [esp + 32 + 16] // rgb
2892 mov ecx, [esp + 32 + 20] // width
2894 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
2902 // This version does general purpose scaling by any amount, up or down.
2903 -// The only thing it can not do it rotation by 90 or 270.
2904 -// For performance the chroma is under sampled, reducing cost of a 3x
2905 +// The only thing it cannot do is rotation by 90 or 270.
2906 +// For performance the chroma is under-sampled, reducing cost of a 3x
2907 // 1080p scale from 8.4 ms to 5.4 ms.
2909 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2910 - const uint8* u_buf,
2911 - const uint8* v_buf,
2915 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2916 + const uint8* u_buf,
2917 + const uint8* v_buf,
2923 mov edx, [esp + 32 + 4] // Y
2924 mov edi, [esp + 32 + 8] // U
2925 mov esi, [esp + 32 + 12] // V
2926 mov ebp, [esp + 32 + 16] // rgb
2927 mov ecx, [esp + 32 + 20] // width
2929 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2938 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2939 - const uint8* u_buf,
2940 - const uint8* v_buf,
2944 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2945 + const uint8* u_buf,
2946 + const uint8* v_buf,
2952 mov edx, [esp + 32 + 4] // Y
2953 mov edi, [esp + 32 + 8] // U
2954 // [esp + 32 + 12] // V
2955 mov ebp, [esp + 32 + 16] // rgb
2956 mov ecx, [esp + 32 + 20] // width
2957 imul ecx, [esp + 32 + 24] // source_dx
2958 @@ -438,152 +439,60 @@ lscalelastpixel:
2969 -// C reference code that mimic the YUV assembly.
2970 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2971 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2972 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2974 -static inline void YuvPixel(uint8 y,
2979 - int b = kCoefficientsRgbY[256+u][0];
2980 - int g = kCoefficientsRgbY[256+u][1];
2981 - int r = kCoefficientsRgbY[256+u][2];
2982 - int a = kCoefficientsRgbY[256+u][3];
2984 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2985 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2986 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2987 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2989 - b = paddsw(b, kCoefficientsRgbY[y][0]);
2990 - g = paddsw(g, kCoefficientsRgbY[y][1]);
2991 - r = paddsw(r, kCoefficientsRgbY[y][2]);
2992 - a = paddsw(a, kCoefficientsRgbY[y][3]);
2999 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
3000 - (packuswb(g) << 8) |
3001 - (packuswb(r) << 16) |
3002 - (packuswb(a) << 24);
3006 -static inline void YuvPixel(uint8 y,
3013 - movq mm0, [kCoefficientsRgbY+2048 + 8 * eax]
3015 - paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax]
3017 - movq mm1, [kCoefficientsRgbY + 8 * eax]
3027 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3029 void FastConvertYUVToRGB32Row(const uint8* y_buf,
3034 - for (int x = 0; x < width; x += 2) {
3035 - uint8 u = u_buf[x >> 1];
3036 - uint8 v = v_buf[x >> 1];
3037 - uint8 y0 = y_buf[x];
3038 - YuvPixel(y0, u, v, rgb_buf);
3039 - if ((x + 1) < width) {
3040 - uint8 y1 = y_buf[x + 1];
3041 - YuvPixel(y1, u, v, rgb_buf + 4);
3043 - rgb_buf += 8; // Advance 2 pixels.
3047 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
3048 -// A shift by 17 is used to further subsample the chrominence channels.
3049 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
3050 -// for 1/65536 pixel accurate interpolation.
3051 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3052 + if (mozilla::supports_sse()) {
3053 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
3058 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
3061 void ScaleYUVToRGB32Row(const uint8* y_buf,
3068 - for (int i = 0; i < width; i += 2) {
3069 - int y = y_buf[x >> 16];
3070 - int u = u_buf[(x >> 17)];
3071 - int v = v_buf[(x >> 17)];
3072 - YuvPixel(y, u, v, rgb_buf);
3074 - if ((i + 1) < width) {
3075 - y = y_buf[x >> 16];
3076 - YuvPixel(y, u, v, rgb_buf+4);
3083 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3084 + if (mozilla::supports_sse()) {
3085 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3090 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3093 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3100 - if (source_dx >= 0x20000) {
3103 - for (int i = 0; i < width; i += 2) {
3104 - int y0 = y_buf[x >> 16];
3105 - int y1 = y_buf[(x >> 16) + 1];
3106 - int u0 = u_buf[(x >> 17)];
3107 - int u1 = u_buf[(x >> 17) + 1];
3108 - int v0 = v_buf[(x >> 17)];
3109 - int v1 = v_buf[(x >> 17) + 1];
3110 - int y_frac = (x & 65535);
3111 - int uv_frac = ((x >> 1) & 65535);
3112 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3113 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
3114 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
3115 - YuvPixel(y, u, v, rgb_buf);
3117 - if ((i + 1) < width) {
3118 - y0 = y_buf[x >> 16];
3119 - y1 = y_buf[(x >> 16) + 1];
3120 - y_frac = (x & 65535);
3121 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3122 - YuvPixel(y, u, v, rgb_buf+4);
3132 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3133 + if (mozilla::supports_sse()) {
3134 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
3140 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);