Bumping manifests a=b2g-bump
[gecko.git] / gfx / ycbcr / convert.patch
blobebf6c3525dbc8e7da0517f7636dad536792473b0
1 diff --git a/gfx/ycbcr/yuv_convert.cpp b/gfx/ycbcr/yuv_convert.cpp
2 --- a/gfx/ycbcr/yuv_convert.cpp
3 +++ b/gfx/ycbcr/yuv_convert.cpp
4 @@ -6,145 +6,102 @@
5 // http://www.fourcc.org/yuv.php
6 // The actual conversion is best described here
7 // http://en.wikipedia.org/wiki/YUV
8 // An article on optimizing YUV conversion using tables instead of multiplies
9 // http://lestourtereaux.free.fr/papers/data/yuvrgb.pdf
11 // YV12 is a full plane of Y and a half height, half width chroma planes
12 // YV16 is a full plane of Y and a full height, half width chroma planes
13 +// YV24 is a full plane of Y and a full height, full width chroma planes
15 // ARGB pixel format is output, which on little endian is stored as BGRA.
16 // The alpha is set to 255, allowing the application to use RGBA or RGB32.
18 -#include "media/base/yuv_convert.h"
19 +#include "yuv_convert.h"
21 // Header for low level row functions.
22 -#include "media/base/yuv_row.h"
24 -#if USE_MMX
25 -#if defined(_MSC_VER)
26 -#include <intrin.h>
27 -#else
28 -#include <mmintrin.h>
29 -#endif
30 -#endif
32 -#if USE_SSE2
33 -#include <emmintrin.h>
34 -#endif
36 -namespace media {
38 +#include "yuv_row.h"
39 +#include "mozilla/SSE.h"
41 +namespace mozilla {
43 +namespace gfx {
45 // 16.16 fixed point arithmetic
46 const int kFractionBits = 16;
47 const int kFractionMax = 1 << kFractionBits;
48 const int kFractionMask = ((1 << kFractionBits) - 1);
50 // Convert a frame of YUV to 32 bit ARGB.
51 -void ConvertYUVToRGB32(const uint8* y_buf,
52 - const uint8* u_buf,
53 - const uint8* v_buf,
54 - uint8* rgb_buf,
55 - int width,
56 - int height,
57 - int y_pitch,
58 - int uv_pitch,
59 - int rgb_pitch,
60 - YUVType yuv_type) {
61 - unsigned int y_shift = yuv_type;
62 - for (int y = 0; y < height; ++y) {
63 - uint8* rgb_row = rgb_buf + y * rgb_pitch;
64 - const uint8* y_ptr = y_buf + y * y_pitch;
65 - const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch;
66 - const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch;
68 - FastConvertYUVToRGB32Row(y_ptr,
69 - u_ptr,
70 - v_ptr,
71 - rgb_row,
72 - width);
73 - }
74 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* y_buf,
75 + const uint8* u_buf,
76 + const uint8* v_buf,
77 + uint8* rgb_buf,
78 + int pic_x,
79 + int pic_y,
80 + int pic_width,
81 + int pic_height,
82 + int y_pitch,
83 + int uv_pitch,
84 + int rgb_pitch,
85 + YUVType yuv_type) {
86 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
87 + unsigned int x_shift = yuv_type == YV24 ? 0 : 1;
88 + // Test for SSE because the optimized code uses movntq, which is not part of MMX.
89 + bool has_sse = supports_mmx() && supports_sse();
90 + // There is no optimized YV24 SSE routine so we check for this and
91 + // fall back to the C code.
92 + has_sse &= yuv_type != YV24;
93 + bool odd_pic_x = yuv_type != YV24 && pic_x % 2 != 0;
94 + int x_width = odd_pic_x ? pic_width - 1 : pic_width;
96 + for (int y = pic_y; y < pic_height + pic_y; ++y) {
97 + uint8* rgb_row = rgb_buf + (y - pic_y) * rgb_pitch;
98 + const uint8* y_ptr = y_buf + y * y_pitch + pic_x;
99 + const uint8* u_ptr = u_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
100 + const uint8* v_ptr = v_buf + (y >> y_shift) * uv_pitch + (pic_x >> x_shift);
102 + if (odd_pic_x) {
103 + // Handle the single odd pixel manually and use the
104 + // fast routines for the remaining pixels.
105 + FastConvertYUVToRGB32Row_C(y_ptr++,
106 + u_ptr++,
107 + v_ptr++,
108 + rgb_row,
109 + 1,
110 + x_shift);
111 + rgb_row += 4;
114 + if (has_sse) {
115 + FastConvertYUVToRGB32Row(y_ptr,
116 + u_ptr,
117 + v_ptr,
118 + rgb_row,
119 + x_width);
121 + else {
122 + FastConvertYUVToRGB32Row_C(y_ptr,
123 + u_ptr,
124 + v_ptr,
125 + rgb_row,
126 + x_width,
127 + x_shift);
131 // MMX used for FastConvertYUVToRGB32Row requires emms instruction.
132 - EMMS();
135 -#if USE_SSE2
136 -// FilterRows combines two rows of the image using linear interpolation.
137 -// SSE2 version does 16 pixels at a time
139 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
140 - int source_width, int source_y_fraction) {
141 - __m128i zero = _mm_setzero_si128();
142 - __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
143 - __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
145 - const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
146 - const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
147 - __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
148 - __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
150 - do {
151 - __m128i y0 = _mm_loadu_si128(y0_ptr128);
152 - __m128i y1 = _mm_loadu_si128(y1_ptr128);
153 - __m128i y2 = _mm_unpackhi_epi8(y0, zero);
154 - __m128i y3 = _mm_unpackhi_epi8(y1, zero);
155 - y0 = _mm_unpacklo_epi8(y0, zero);
156 - y1 = _mm_unpacklo_epi8(y1, zero);
157 - y0 = _mm_mullo_epi16(y0, y0_fraction);
158 - y1 = _mm_mullo_epi16(y1, y1_fraction);
159 - y2 = _mm_mullo_epi16(y2, y0_fraction);
160 - y3 = _mm_mullo_epi16(y3, y1_fraction);
161 - y0 = _mm_add_epi16(y0, y1);
162 - y2 = _mm_add_epi16(y2, y3);
163 - y0 = _mm_srli_epi16(y0, 8);
164 - y2 = _mm_srli_epi16(y2, 8);
165 - y0 = _mm_packus_epi16(y0, y2);
166 - *dest128++ = y0;
167 - ++y0_ptr128;
168 - ++y1_ptr128;
169 - } while (dest128 < end128);
171 -#elif USE_MMX
172 -// MMX version does 8 pixels at a time
173 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
174 - int source_width, int source_y_fraction) {
175 - __m64 zero = _mm_setzero_si64();
176 - __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
177 - __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
179 - const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
180 - const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
181 - __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
182 - __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
184 - do {
185 - __m64 y0 = *y0_ptr64++;
186 - __m64 y1 = *y1_ptr64++;
187 - __m64 y2 = _mm_unpackhi_pi8(y0, zero);
188 - __m64 y3 = _mm_unpackhi_pi8(y1, zero);
189 - y0 = _mm_unpacklo_pi8(y0, zero);
190 - y1 = _mm_unpacklo_pi8(y1, zero);
191 - y0 = _mm_mullo_pi16(y0, y0_fraction);
192 - y1 = _mm_mullo_pi16(y1, y1_fraction);
193 - y2 = _mm_mullo_pi16(y2, y0_fraction);
194 - y3 = _mm_mullo_pi16(y3, y1_fraction);
195 - y0 = _mm_add_pi16(y0, y1);
196 - y2 = _mm_add_pi16(y2, y3);
197 - y0 = _mm_srli_pi16(y0, 8);
198 - y2 = _mm_srli_pi16(y2, 8);
199 - y0 = _mm_packs_pu16(y0, y2);
200 - *dest64++ = y0;
201 - } while (dest64 < end64);
203 -#else // no MMX or SSE2
204 + if (has_sse)
205 + EMMS();
208 // C version does 8 at a time to mimic MMX code
209 -static void FilterRows(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
210 - int source_width, int source_y_fraction) {
211 +static void FilterRows_C(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
212 + int source_width, int source_y_fraction) {
213 int y1_fraction = source_y_fraction;
214 int y0_fraction = 256 - y1_fraction;
215 uint8* end = ybuf + source_width;
216 do {
217 ybuf[0] = (y0_ptr[0] * y0_fraction + y1_ptr[0] * y1_fraction) >> 8;
218 ybuf[1] = (y0_ptr[1] * y0_fraction + y1_ptr[1] * y1_fraction) >> 8;
219 ybuf[2] = (y0_ptr[2] * y0_fraction + y1_ptr[2] * y1_fraction) >> 8;
220 ybuf[3] = (y0_ptr[3] * y0_fraction + y1_ptr[3] * y1_fraction) >> 8;
221 @@ -152,46 +140,77 @@ static void FilterRows(uint8* ybuf, cons
222 ybuf[5] = (y0_ptr[5] * y0_fraction + y1_ptr[5] * y1_fraction) >> 8;
223 ybuf[6] = (y0_ptr[6] * y0_fraction + y1_ptr[6] * y1_fraction) >> 8;
224 ybuf[7] = (y0_ptr[7] * y0_fraction + y1_ptr[7] * y1_fraction) >> 8;
225 y0_ptr += 8;
226 y1_ptr += 8;
227 ybuf += 8;
228 } while (ybuf < end);
230 -#endif
232 +#ifdef MOZILLA_MAY_SUPPORT_MMX
233 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
234 + int source_width, int source_y_fraction);
235 +#endif
237 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
238 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
239 + int source_width, int source_y_fraction);
240 +#endif
242 +static inline void FilterRows(uint8* ybuf, const uint8* y0_ptr,
243 + const uint8* y1_ptr, int source_width,
244 + int source_y_fraction) {
245 +#ifdef MOZILLA_MAY_SUPPORT_SSE2
246 + if (mozilla::supports_sse2()) {
247 + FilterRows_SSE2(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
248 + return;
250 +#endif
252 +#ifdef MOZILLA_MAY_SUPPORT_MMX
253 + if (mozilla::supports_mmx()) {
254 + FilterRows_MMX(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
255 + return;
257 +#endif
259 + FilterRows_C(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
263 // Scale a frame of YUV to 32 bit ARGB.
264 -void ScaleYUVToRGB32(const uint8* y_buf,
265 - const uint8* u_buf,
266 - const uint8* v_buf,
267 - uint8* rgb_buf,
268 - int source_width,
269 - int source_height,
270 - int width,
271 - int height,
272 - int y_pitch,
273 - int uv_pitch,
274 - int rgb_pitch,
275 - YUVType yuv_type,
276 - Rotate view_rotate,
277 - ScaleFilter filter) {
278 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* y_buf,
279 + const uint8* u_buf,
280 + const uint8* v_buf,
281 + uint8* rgb_buf,
282 + int source_width,
283 + int source_height,
284 + int width,
285 + int height,
286 + int y_pitch,
287 + int uv_pitch,
288 + int rgb_pitch,
289 + YUVType yuv_type,
290 + Rotate view_rotate,
291 + ScaleFilter filter) {
292 + bool has_mmx = supports_mmx();
294 // 4096 allows 3 buffers to fit in 12k.
295 // Helps performance on CPU with 16K L1 cache.
296 // Large enough for 3830x2160 and 30" displays which are 2560x1600.
297 const int kFilterBufferSize = 4096;
298 // Disable filtering if the screen is too big (to avoid buffer overflows).
299 // This should never happen to regular users: they don't have monitors
300 // wider than 4096 pixels.
301 // TODO(fbarchard): Allow rotated videos to filter.
302 if (source_width > kFilterBufferSize || view_rotate)
303 filter = FILTER_NONE;
305 - unsigned int y_shift = yuv_type;
306 + unsigned int y_shift = yuv_type == YV12 ? 1 : 0;
307 // Diagram showing origin and direction of source sampling.
308 // ->0 4<-
309 // 7 3
311 // 6 5
312 // ->1 2<-
313 // Rotations that start at right side of image.
314 if ((view_rotate == ROTATE_180) ||
315 @@ -276,17 +295,17 @@ void ScaleYUVToRGB32(const uint8* y_buf,
316 int source_uv_fraction =
317 ((source_y_subpixel >> y_shift) & kFractionMask) >> 8;
319 const uint8* y_ptr = y0_ptr;
320 const uint8* u_ptr = u0_ptr;
321 const uint8* v_ptr = v0_ptr;
322 // Apply vertical filtering if necessary.
323 // TODO(fbarchard): Remove memcpy when not necessary.
324 - if (filter & media::FILTER_BILINEAR_V) {
325 + if (filter & mozilla::gfx::FILTER_BILINEAR_V) {
326 if (yscale_fixed != kFractionMax &&
327 source_y_fraction && ((source_y + 1) < source_height)) {
328 FilterRows(ybuf, y0_ptr, y1_ptr, source_width, source_y_fraction);
329 } else {
330 memcpy(ybuf, y0_ptr, source_width);
332 y_ptr = ybuf;
333 ybuf[source_width] = ybuf[source_width-1];
334 @@ -303,44 +322,50 @@ void ScaleYUVToRGB32(const uint8* y_buf,
335 u_ptr = ubuf;
336 v_ptr = vbuf;
337 ubuf[uv_source_width] = ubuf[uv_source_width - 1];
338 vbuf[uv_source_width] = vbuf[uv_source_width - 1];
340 if (source_dx == kFractionMax) { // Not scaled
341 FastConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
342 dest_pixel, width);
343 - } else {
344 - if (filter & FILTER_BILINEAR_H) {
345 + } else if (filter & FILTER_BILINEAR_H) {
346 LinearScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
347 dest_pixel, width, source_dx);
348 } else {
349 // Specialized scalers and rotation.
350 -#if USE_MMX && defined(_MSC_VER)
351 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_MSC_VER) && defined(_M_IX86)
352 + if(mozilla::supports_sse()) {
353 if (width == (source_width * 2)) {
354 - DoubleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
355 - dest_pixel, width);
356 + DoubleYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
357 + dest_pixel, width);
358 } else if ((source_dx & kFractionMask) == 0) {
359 // Scaling by integer scale factor. ie half.
360 - ConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
361 - dest_pixel, width,
362 - source_dx >> kFractionBits);
363 + ConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
364 + dest_pixel, width,
365 + source_dx >> kFractionBits);
366 } else if (source_dx_uv == source_dx) { // Not rotated.
367 ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
368 dest_pixel, width, source_dx);
369 } else {
370 - RotateConvertYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
371 - dest_pixel, width,
372 - source_dx >> kFractionBits,
373 - source_dx_uv >> kFractionBits);
374 + RotateConvertYUVToRGB32Row_SSE(y_ptr, u_ptr, v_ptr,
375 + dest_pixel, width,
376 + source_dx >> kFractionBits,
377 + source_dx_uv >> kFractionBits);
380 + else {
381 + ScaleYUVToRGB32Row_C(y_ptr, u_ptr, v_ptr,
382 + dest_pixel, width, source_dx);
384 #else
385 - ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
386 - dest_pixel, width, source_dx);
387 -#endif
389 + ScaleYUVToRGB32Row(y_ptr, u_ptr, v_ptr,
390 + dest_pixel, width, source_dx);
391 +#endif
394 // MMX used for FastConvertYUVToRGB32Row and FilterRows requires emms.
395 - EMMS();
398 -} // namespace media
399 + if (has_mmx)
400 + EMMS();
403 +} // namespace gfx
404 +} // namespace mozilla
405 diff --git a/gfx/ycbcr/yuv_convert.h b/gfx/ycbcr/yuv_convert.h
406 --- a/gfx/ycbcr/yuv_convert.h
407 +++ b/gfx/ycbcr/yuv_convert.h
408 @@ -1,72 +1,79 @@
409 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
410 // Use of this source code is governed by a BSD-style license that can be
411 // found in the LICENSE file.
413 #ifndef MEDIA_BASE_YUV_CONVERT_H_
414 #define MEDIA_BASE_YUV_CONVERT_H_
416 -#include "base/basictypes.h"
418 -namespace media {
420 +#include "chromium_types.h"
421 +#include "gfxCore.h"
423 +namespace mozilla {
425 +namespace gfx {
427 // Type of YUV surface.
428 // The value of these enums matter as they are used to shift vertical indices.
429 enum YUVType {
430 - YV16 = 0, // YV16 is half width and full height chroma channels.
431 - YV12 = 1, // YV12 is half width and half height chroma channels.
432 + YV12 = 0, // YV12 is half width and half height chroma channels.
433 + YV16 = 1, // YV16 is half width and full height chroma channels.
434 + YV24 = 2 // YV24 is full width and full height chroma channels.
437 // Mirror means flip the image horizontally, as in looking in a mirror.
438 // Rotate happens after mirroring.
439 enum Rotate {
440 ROTATE_0, // Rotation off.
441 ROTATE_90, // Rotate clockwise.
442 ROTATE_180, // Rotate upside down.
443 ROTATE_270, // Rotate counter clockwise.
444 MIRROR_ROTATE_0, // Mirror horizontally.
445 MIRROR_ROTATE_90, // Mirror then Rotate clockwise.
446 MIRROR_ROTATE_180, // Mirror vertically.
447 - MIRROR_ROTATE_270, // Transpose.
448 + MIRROR_ROTATE_270 // Transpose.
451 // Filter affects how scaling looks.
452 enum ScaleFilter {
453 FILTER_NONE = 0, // No filter (point sampled).
454 FILTER_BILINEAR_H = 1, // Bilinear horizontal filter.
455 FILTER_BILINEAR_V = 2, // Bilinear vertical filter.
456 - FILTER_BILINEAR = 3, // Bilinear filter.
457 + FILTER_BILINEAR = 3 // Bilinear filter.
460 // Convert a frame of YUV to 32 bit ARGB.
461 // Pass in YV16/YV12 depending on source format
462 -void ConvertYUVToRGB32(const uint8* yplane,
463 - const uint8* uplane,
464 - const uint8* vplane,
465 - uint8* rgbframe,
466 - int width,
467 - int height,
468 - int ystride,
469 - int uvstride,
470 - int rgbstride,
471 - YUVType yuv_type);
472 +NS_GFX_(void) ConvertYCbCrToRGB32(const uint8* yplane,
473 + const uint8* uplane,
474 + const uint8* vplane,
475 + uint8* rgbframe,
476 + int pic_x,
477 + int pic_y,
478 + int pic_width,
479 + int pic_height,
480 + int ystride,
481 + int uvstride,
482 + int rgbstride,
483 + YUVType yuv_type);
485 // Scale a frame of YUV to 32 bit ARGB.
486 // Supports rotation and mirroring.
487 -void ScaleYUVToRGB32(const uint8* yplane,
488 - const uint8* uplane,
489 - const uint8* vplane,
490 - uint8* rgbframe,
491 - int source_width,
492 - int source_height,
493 - int width,
494 - int height,
495 - int ystride,
496 - int uvstride,
497 - int rgbstride,
498 - YUVType yuv_type,
499 - Rotate view_rotate,
500 - ScaleFilter filter);
502 -} // namespace media
504 +NS_GFX_(void) ScaleYCbCrToRGB32(const uint8* yplane,
505 + const uint8* uplane,
506 + const uint8* vplane,
507 + uint8* rgbframe,
508 + int source_width,
509 + int source_height,
510 + int width,
511 + int height,
512 + int ystride,
513 + int uvstride,
514 + int rgbstride,
515 + YUVType yuv_type,
516 + Rotate view_rotate,
517 + ScaleFilter filter);
519 +} // namespace gfx
520 +} // namespace mozilla
522 #endif // MEDIA_BASE_YUV_CONVERT_H_
523 diff --git a/gfx/ycbcr/yuv_convert_mmx.cpp b/gfx/ycbcr/yuv_convert_mmx.cpp
524 new file mode 100644
525 --- /dev/null
526 +++ b/gfx/ycbcr/yuv_convert_mmx.cpp
527 @@ -0,0 +1,45 @@
528 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
529 +// Use of this source code is governed by a BSD-style license that can be
530 +// found in the LICENSE file.
532 +#include <mmintrin.h>
533 +#include "yuv_row.h"
535 +namespace mozilla {
536 +namespace gfx {
538 +// FilterRows combines two rows of the image using linear interpolation.
539 +// MMX version does 8 pixels at a time.
540 +void FilterRows_MMX(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
541 + int source_width, int source_y_fraction) {
542 + __m64 zero = _mm_setzero_si64();
543 + __m64 y1_fraction = _mm_set1_pi16(source_y_fraction);
544 + __m64 y0_fraction = _mm_set1_pi16(256 - source_y_fraction);
546 + const __m64* y0_ptr64 = reinterpret_cast<const __m64*>(y0_ptr);
547 + const __m64* y1_ptr64 = reinterpret_cast<const __m64*>(y1_ptr);
548 + __m64* dest64 = reinterpret_cast<__m64*>(ybuf);
549 + __m64* end64 = reinterpret_cast<__m64*>(ybuf + source_width);
551 + do {
552 + __m64 y0 = *y0_ptr64++;
553 + __m64 y1 = *y1_ptr64++;
554 + __m64 y2 = _mm_unpackhi_pi8(y0, zero);
555 + __m64 y3 = _mm_unpackhi_pi8(y1, zero);
556 + y0 = _mm_unpacklo_pi8(y0, zero);
557 + y1 = _mm_unpacklo_pi8(y1, zero);
558 + y0 = _mm_mullo_pi16(y0, y0_fraction);
559 + y1 = _mm_mullo_pi16(y1, y1_fraction);
560 + y2 = _mm_mullo_pi16(y2, y0_fraction);
561 + y3 = _mm_mullo_pi16(y3, y1_fraction);
562 + y0 = _mm_add_pi16(y0, y1);
563 + y2 = _mm_add_pi16(y2, y3);
564 + y0 = _mm_srli_pi16(y0, 8);
565 + y2 = _mm_srli_pi16(y2, 8);
566 + y0 = _mm_packs_pu16(y0, y2);
567 + *dest64++ = y0;
568 + } while (dest64 < end64);
573 diff --git a/gfx/ycbcr/yuv_convert_sse2.cpp b/gfx/ycbcr/yuv_convert_sse2.cpp
574 new file mode 100644
575 --- /dev/null
576 +++ b/gfx/ycbcr/yuv_convert_sse2.cpp
577 @@ -0,0 +1,47 @@
578 +// Copyright (c) 2010 The Chromium Authors. All rights reserved.
579 +// Use of this source code is governed by a BSD-style license that can be
580 +// found in the LICENSE file.
582 +#include <emmintrin.h>
583 +#include "yuv_row.h"
585 +namespace mozilla {
586 +namespace gfx {
588 +// FilterRows combines two rows of the image using linear interpolation.
589 +// SSE2 version does 16 pixels at a time.
590 +void FilterRows_SSE2(uint8* ybuf, const uint8* y0_ptr, const uint8* y1_ptr,
591 + int source_width, int source_y_fraction) {
592 + __m128i zero = _mm_setzero_si128();
593 + __m128i y1_fraction = _mm_set1_epi16(source_y_fraction);
594 + __m128i y0_fraction = _mm_set1_epi16(256 - source_y_fraction);
596 + const __m128i* y0_ptr128 = reinterpret_cast<const __m128i*>(y0_ptr);
597 + const __m128i* y1_ptr128 = reinterpret_cast<const __m128i*>(y1_ptr);
598 + __m128i* dest128 = reinterpret_cast<__m128i*>(ybuf);
599 + __m128i* end128 = reinterpret_cast<__m128i*>(ybuf + source_width);
601 + do {
602 + __m128i y0 = _mm_loadu_si128(y0_ptr128);
603 + __m128i y1 = _mm_loadu_si128(y1_ptr128);
604 + __m128i y2 = _mm_unpackhi_epi8(y0, zero);
605 + __m128i y3 = _mm_unpackhi_epi8(y1, zero);
606 + y0 = _mm_unpacklo_epi8(y0, zero);
607 + y1 = _mm_unpacklo_epi8(y1, zero);
608 + y0 = _mm_mullo_epi16(y0, y0_fraction);
609 + y1 = _mm_mullo_epi16(y1, y1_fraction);
610 + y2 = _mm_mullo_epi16(y2, y0_fraction);
611 + y3 = _mm_mullo_epi16(y3, y1_fraction);
612 + y0 = _mm_add_epi16(y0, y1);
613 + y2 = _mm_add_epi16(y2, y3);
614 + y0 = _mm_srli_epi16(y0, 8);
615 + y2 = _mm_srli_epi16(y2, 8);
616 + y0 = _mm_packus_epi16(y0, y2);
617 + *dest128++ = y0;
618 + ++y0_ptr128;
619 + ++y1_ptr128;
620 + } while (dest128 < end128);
625 diff --git a/gfx/ycbcr/yuv_row.h b/gfx/ycbcr/yuv_row.h
626 --- a/gfx/ycbcr/yuv_row.h
627 +++ b/gfx/ycbcr/yuv_row.h
628 @@ -5,109 +5,133 @@
629 // yuv_row internal functions to handle YUV conversion and scaling to RGB.
630 // These functions are used from both yuv_convert.cc and yuv_scale.cc.
632 // TODO(fbarchard): Write function that can handle rotation and scaling.
634 #ifndef MEDIA_BASE_YUV_ROW_H_
635 #define MEDIA_BASE_YUV_ROW_H_
637 -#include "base/basictypes.h"
638 +#include "chromium_types.h"
640 extern "C" {
641 // Can only do 1x.
642 // This is the second fastest of the scalers.
643 void FastConvertYUVToRGB32Row(const uint8* y_buf,
644 const uint8* u_buf,
645 const uint8* v_buf,
646 uint8* rgb_buf,
647 int width);
649 -// Can do 1x, half size or any scale down by an integer amount.
650 -// Step can be negative (mirroring, rotate 180).
651 -// This is the third fastest of the scalers.
652 -void ConvertYUVToRGB32Row(const uint8* y_buf,
653 - const uint8* u_buf,
654 - const uint8* v_buf,
655 - uint8* rgb_buf,
656 - int width,
657 - int step);
659 -// Rotate is like Convert, but applies different step to Y versus U and V.
660 -// This allows rotation by 90 or 270, by stepping by stride.
661 -// This is the forth fastest of the scalers.
662 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
663 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
664 const uint8* u_buf,
665 const uint8* v_buf,
666 uint8* rgb_buf,
667 int width,
668 - int ystep,
669 - int uvstep);
670 + unsigned int x_shift);
672 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
673 + const uint8* u_buf,
674 + const uint8* v_buf,
675 + uint8* rgb_buf,
676 + int width);
678 +// Can do 1x, half size or any scale down by an integer amount.
679 +// Step can be negative (mirroring, rotate 180).
680 +// This is the third fastest of the scalers.
681 +// Only defined on Windows x86-32.
682 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
683 + const uint8* u_buf,
684 + const uint8* v_buf,
685 + uint8* rgb_buf,
686 + int width,
687 + int step);
689 +// Rotate is like Convert, but applies different step to Y versus U and V.
690 +// This allows rotation by 90 or 270, by stepping by stride.
691 +// This is the fourth fastest of the scalers.
692 +// Only defined on Windows x86-32.
693 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
694 + const uint8* u_buf,
695 + const uint8* v_buf,
696 + uint8* rgb_buf,
697 + int width,
698 + int ystep,
699 + int uvstep);
701 // Doubler does 4 pixels at a time. Each pixel is replicated.
702 // This is the fastest of the scalers.
703 -void DoubleYUVToRGB32Row(const uint8* y_buf,
704 - const uint8* u_buf,
705 - const uint8* v_buf,
706 - uint8* rgb_buf,
707 - int width);
708 +// Only defined on Windows x86-32.
709 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
710 + const uint8* u_buf,
711 + const uint8* v_buf,
712 + uint8* rgb_buf,
713 + int width);
715 // Handles arbitrary scaling up or down.
716 // Mirroring is supported, but not 90 or 270 degree rotation.
717 // Chroma is under sampled every 2 pixels for performance.
718 void ScaleYUVToRGB32Row(const uint8* y_buf,
719 const uint8* u_buf,
720 const uint8* v_buf,
721 uint8* rgb_buf,
722 int width,
723 int source_dx);
725 +void ScaleYUVToRGB32Row(const uint8* y_buf,
726 + const uint8* u_buf,
727 + const uint8* v_buf,
728 + uint8* rgb_buf,
729 + int width,
730 + int source_dx);
732 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
733 + const uint8* u_buf,
734 + const uint8* v_buf,
735 + uint8* rgb_buf,
736 + int width,
737 + int source_dx);
739 // Handles arbitrary scaling up or down with bilinear filtering.
740 // Mirroring is supported, but not 90 or 270 degree rotation.
741 // Chroma is under sampled every 2 pixels for performance.
742 // This is the slowest of the scalers.
743 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
744 const uint8* u_buf,
745 const uint8* v_buf,
746 uint8* rgb_buf,
747 int width,
748 int source_dx);
750 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
751 + const uint8* u_buf,
752 + const uint8* v_buf,
753 + uint8* rgb_buf,
754 + int width,
755 + int source_dx);
757 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
758 + const uint8* u_buf,
759 + const uint8* v_buf,
760 + uint8* rgb_buf,
761 + int width,
762 + int source_dx);
765 #if defined(_MSC_VER)
766 #define SIMD_ALIGNED(var) __declspec(align(16)) var
767 #else
768 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
769 #endif
770 extern SIMD_ALIGNED(int16 kCoefficientsRgbY[768][4]);
772 -// Method to force C version.
773 -//#define USE_MMX 0
774 -//#define USE_SSE2 0
776 -#if !defined(USE_MMX)
777 -// Windows, Mac and Linux/BSD use MMX
778 -#if defined(__MMX__) || defined(_MSC_VER)
779 -#define USE_MMX 1
780 -#else
781 -#define USE_MMX 0
782 -#endif
783 -#endif
785 -#if !defined(USE_SSE2)
786 -#if defined(__SSE2__) || defined(ARCH_CPU_X86_64) || _M_IX86_FP==2
787 -#define USE_SSE2 1
788 -#else
789 -#define USE_SSE2 0
790 -#endif
791 -#endif
793 // x64 uses MMX2 (SSE) so emms is not required.
794 // Warning C4799: function has no EMMS instruction.
795 // EMMS() is slow and should be called by the calling function once per image.
796 -#if USE_MMX && !defined(ARCH_CPU_X86_64)
797 +#if defined(ARCH_CPU_X86) && !defined(ARCH_CPU_X86_64)
798 #if defined(_MSC_VER)
799 #define EMMS() __asm emms
800 #pragma warning(disable: 4799)
801 #else
802 #define EMMS() asm("emms")
803 #endif
804 #else
805 #define EMMS()
806 diff --git a/gfx/ycbcr/yuv_row_c.cpp b/gfx/ycbcr/yuv_row_c.cpp
807 --- a/gfx/ycbcr/yuv_row_c.cpp
808 +++ b/gfx/ycbcr/yuv_row_c.cpp
809 @@ -1,812 +1,18 @@
810 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
811 // Use of this source code is governed by a BSD-style license that can be
812 // found in the LICENSE file.
814 -#include "media/base/yuv_row.h"
816 -#ifdef _DEBUG
817 -#include "base/logging.h"
818 -#else
819 +#include "yuv_row.h"
821 #define DCHECK(a)
822 -#endif
824 extern "C" {
826 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
828 -// AMD64 ABI uses register paremters.
829 -void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
830 - const uint8* u_buf, // rsi
831 - const uint8* v_buf, // rdx
832 - uint8* rgb_buf, // rcx
833 - int width) { // r8
834 - asm(
835 - "jmp convertend\n"
836 -"convertloop:"
837 - "movzb (%1),%%r10\n"
838 - "add $0x1,%1\n"
839 - "movzb (%2),%%r11\n"
840 - "add $0x1,%2\n"
841 - "movq 2048(%5,%%r10,8),%%xmm0\n"
842 - "movzb (%0),%%r10\n"
843 - "movq 4096(%5,%%r11,8),%%xmm1\n"
844 - "movzb 0x1(%0),%%r11\n"
845 - "paddsw %%xmm1,%%xmm0\n"
846 - "movq (%5,%%r10,8),%%xmm2\n"
847 - "add $0x2,%0\n"
848 - "movq (%5,%%r11,8),%%xmm3\n"
849 - "paddsw %%xmm0,%%xmm2\n"
850 - "paddsw %%xmm0,%%xmm3\n"
851 - "shufps $0x44,%%xmm3,%%xmm2\n"
852 - "psraw $0x6,%%xmm2\n"
853 - "packuswb %%xmm2,%%xmm2\n"
854 - "movq %%xmm2,0x0(%3)\n"
855 - "add $0x8,%3\n"
856 -"convertend:"
857 - "sub $0x2,%4\n"
858 - "jns convertloop\n"
860 -"convertnext:"
861 - "add $0x1,%4\n"
862 - "js convertdone\n"
864 - "movzb (%1),%%r10\n"
865 - "movq 2048(%5,%%r10,8),%%xmm0\n"
866 - "movzb (%2),%%r10\n"
867 - "movq 4096(%5,%%r10,8),%%xmm1\n"
868 - "paddsw %%xmm1,%%xmm0\n"
869 - "movzb (%0),%%r10\n"
870 - "movq (%5,%%r10,8),%%xmm1\n"
871 - "paddsw %%xmm0,%%xmm1\n"
872 - "psraw $0x6,%%xmm1\n"
873 - "packuswb %%xmm1,%%xmm1\n"
874 - "movd %%xmm1,0x0(%3)\n"
875 -"convertdone:"
877 - : "r"(y_buf), // %0
878 - "r"(u_buf), // %1
879 - "r"(v_buf), // %2
880 - "r"(rgb_buf), // %3
881 - "r"(width), // %4
882 - "r" (kCoefficientsRgbY) // %5
883 - : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
887 -void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
888 - const uint8* u_buf, // rsi
889 - const uint8* v_buf, // rdx
890 - uint8* rgb_buf, // rcx
891 - int width, // r8
892 - int source_dx) { // r9
893 - asm(
894 - "xor %%r11,%%r11\n"
895 - "sub $0x2,%4\n"
896 - "js scalenext\n"
898 -"scaleloop:"
899 - "mov %%r11,%%r10\n"
900 - "sar $0x11,%%r10\n"
901 - "movzb (%1,%%r10,1),%%rax\n"
902 - "movq 2048(%5,%%rax,8),%%xmm0\n"
903 - "movzb (%2,%%r10,1),%%rax\n"
904 - "movq 4096(%5,%%rax,8),%%xmm1\n"
905 - "lea (%%r11,%6),%%r10\n"
906 - "sar $0x10,%%r11\n"
907 - "movzb (%0,%%r11,1),%%rax\n"
908 - "paddsw %%xmm1,%%xmm0\n"
909 - "movq (%5,%%rax,8),%%xmm1\n"
910 - "lea (%%r10,%6),%%r11\n"
911 - "sar $0x10,%%r10\n"
912 - "movzb (%0,%%r10,1),%%rax\n"
913 - "movq (%5,%%rax,8),%%xmm2\n"
914 - "paddsw %%xmm0,%%xmm1\n"
915 - "paddsw %%xmm0,%%xmm2\n"
916 - "shufps $0x44,%%xmm2,%%xmm1\n"
917 - "psraw $0x6,%%xmm1\n"
918 - "packuswb %%xmm1,%%xmm1\n"
919 - "movq %%xmm1,0x0(%3)\n"
920 - "add $0x8,%3\n"
921 - "sub $0x2,%4\n"
922 - "jns scaleloop\n"
924 -"scalenext:"
925 - "add $0x1,%4\n"
926 - "js scaledone\n"
928 - "mov %%r11,%%r10\n"
929 - "sar $0x11,%%r10\n"
930 - "movzb (%1,%%r10,1),%%rax\n"
931 - "movq 2048(%5,%%rax,8),%%xmm0\n"
932 - "movzb (%2,%%r10,1),%%rax\n"
933 - "movq 4096(%5,%%rax,8),%%xmm1\n"
934 - "paddsw %%xmm1,%%xmm0\n"
935 - "sar $0x10,%%r11\n"
936 - "movzb (%0,%%r11,1),%%rax\n"
937 - "movq (%5,%%rax,8),%%xmm1\n"
938 - "paddsw %%xmm0,%%xmm1\n"
939 - "psraw $0x6,%%xmm1\n"
940 - "packuswb %%xmm1,%%xmm1\n"
941 - "movd %%xmm1,0x0(%3)\n"
943 -"scaledone:"
945 - : "r"(y_buf), // %0
946 - "r"(u_buf), // %1
947 - "r"(v_buf), // %2
948 - "r"(rgb_buf), // %3
949 - "r"(width), // %4
950 - "r" (kCoefficientsRgbY), // %5
951 - "r"(static_cast<long>(source_dx)) // %6
952 - : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
956 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
957 - const uint8* u_buf,
958 - const uint8* v_buf,
959 - uint8* rgb_buf,
960 - int width,
961 - int source_dx) {
962 - asm(
963 - "xor %%r11,%%r11\n" // x = 0
964 - "sub $0x2,%4\n"
965 - "js .lscalenext\n"
966 - "cmp $0x20000,%6\n" // if source_dx >= 2.0
967 - "jl .lscalehalf\n"
968 - "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
969 -".lscalehalf:"
971 -".lscaleloop:"
972 - "mov %%r11,%%r10\n"
973 - "sar $0x11,%%r10\n"
975 - "movzb (%1, %%r10, 1), %%r13 \n"
976 - "movzb 1(%1, %%r10, 1), %%r14 \n"
977 - "mov %%r11, %%rax \n"
978 - "and $0x1fffe, %%rax \n"
979 - "imul %%rax, %%r14 \n"
980 - "xor $0x1fffe, %%rax \n"
981 - "imul %%rax, %%r13 \n"
982 - "add %%r14, %%r13 \n"
983 - "shr $17, %%r13 \n"
984 - "movq 2048(%5,%%r13,8), %%xmm0\n"
986 - "movzb (%2, %%r10, 1), %%r13 \n"
987 - "movzb 1(%2, %%r10, 1), %%r14 \n"
988 - "mov %%r11, %%rax \n"
989 - "and $0x1fffe, %%rax \n"
990 - "imul %%rax, %%r14 \n"
991 - "xor $0x1fffe, %%rax \n"
992 - "imul %%rax, %%r13 \n"
993 - "add %%r14, %%r13 \n"
994 - "shr $17, %%r13 \n"
995 - "movq 4096(%5,%%r13,8), %%xmm1\n"
997 - "mov %%r11, %%rax \n"
998 - "lea (%%r11,%6),%%r10\n"
999 - "sar $0x10,%%r11\n"
1000 - "paddsw %%xmm1,%%xmm0\n"
1002 - "movzb (%0, %%r11, 1), %%r13 \n"
1003 - "movzb 1(%0, %%r11, 1), %%r14 \n"
1004 - "and $0xffff, %%rax \n"
1005 - "imul %%rax, %%r14 \n"
1006 - "xor $0xffff, %%rax \n"
1007 - "imul %%rax, %%r13 \n"
1008 - "add %%r14, %%r13 \n"
1009 - "shr $16, %%r13 \n"
1010 - "movq (%5,%%r13,8),%%xmm1\n"
1012 - "mov %%r10, %%rax \n"
1013 - "lea (%%r10,%6),%%r11\n"
1014 - "sar $0x10,%%r10\n"
1016 - "movzb (%0,%%r10,1), %%r13 \n"
1017 - "movzb 1(%0,%%r10,1), %%r14 \n"
1018 - "and $0xffff, %%rax \n"
1019 - "imul %%rax, %%r14 \n"
1020 - "xor $0xffff, %%rax \n"
1021 - "imul %%rax, %%r13 \n"
1022 - "add %%r14, %%r13 \n"
1023 - "shr $16, %%r13 \n"
1024 - "movq (%5,%%r13,8),%%xmm2\n"
1026 - "paddsw %%xmm0,%%xmm1\n"
1027 - "paddsw %%xmm0,%%xmm2\n"
1028 - "shufps $0x44,%%xmm2,%%xmm1\n"
1029 - "psraw $0x6,%%xmm1\n"
1030 - "packuswb %%xmm1,%%xmm1\n"
1031 - "movq %%xmm1,0x0(%3)\n"
1032 - "add $0x8,%3\n"
1033 - "sub $0x2,%4\n"
1034 - "jns .lscaleloop\n"
1036 -".lscalenext:"
1037 - "add $0x1,%4\n"
1038 - "js .lscaledone\n"
1040 - "mov %%r11,%%r10\n"
1041 - "sar $0x11,%%r10\n"
1043 - "movzb (%1,%%r10,1), %%r13 \n"
1044 - "movq 2048(%5,%%r13,8),%%xmm0\n"
1046 - "movzb (%2,%%r10,1), %%r13 \n"
1047 - "movq 4096(%5,%%r13,8),%%xmm1\n"
1049 - "paddsw %%xmm1,%%xmm0\n"
1050 - "sar $0x10,%%r11\n"
1052 - "movzb (%0,%%r11,1), %%r13 \n"
1053 - "movq (%5,%%r13,8),%%xmm1\n"
1055 - "paddsw %%xmm0,%%xmm1\n"
1056 - "psraw $0x6,%%xmm1\n"
1057 - "packuswb %%xmm1,%%xmm1\n"
1058 - "movd %%xmm1,0x0(%3)\n"
1060 -".lscaledone:"
1062 - : "r"(y_buf), // %0
1063 - "r"(u_buf), // %1
1064 - "r"(v_buf), // %2
1065 - "r"(rgb_buf), // %3
1066 - "r"(width), // %4
1067 - "r" (kCoefficientsRgbY), // %5
1068 - "r"(static_cast<long>(source_dx)) // %6
1069 - : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1073 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1075 -// PIC version is slower because less registers are available, so
1076 -// non-PIC is used on platforms where it is possible.
1078 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1079 - const uint8* u_buf,
1080 - const uint8* v_buf,
1081 - uint8* rgb_buf,
1082 - int width);
1083 - asm(
1084 - ".text\n"
1085 - ".global FastConvertYUVToRGB32Row\n"
1086 -"FastConvertYUVToRGB32Row:\n"
1087 - "pusha\n"
1088 - "mov 0x24(%esp),%edx\n"
1089 - "mov 0x28(%esp),%edi\n"
1090 - "mov 0x2c(%esp),%esi\n"
1091 - "mov 0x30(%esp),%ebp\n"
1092 - "mov 0x34(%esp),%ecx\n"
1093 - "jmp convertend\n"
1095 -"convertloop:"
1096 - "movzbl (%edi),%eax\n"
1097 - "add $0x1,%edi\n"
1098 - "movzbl (%esi),%ebx\n"
1099 - "add $0x1,%esi\n"
1100 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1101 - "movzbl (%edx),%eax\n"
1102 - "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
1103 - "movzbl 0x1(%edx),%ebx\n"
1104 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1105 - "add $0x2,%edx\n"
1106 - "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
1107 - "paddsw %mm0,%mm1\n"
1108 - "paddsw %mm0,%mm2\n"
1109 - "psraw $0x6,%mm1\n"
1110 - "psraw $0x6,%mm2\n"
1111 - "packuswb %mm2,%mm1\n"
1112 - "movntq %mm1,0x0(%ebp)\n"
1113 - "add $0x8,%ebp\n"
1114 -"convertend:"
1115 - "sub $0x2,%ecx\n"
1116 - "jns convertloop\n"
1118 - "and $0x1,%ecx\n"
1119 - "je convertdone\n"
1121 - "movzbl (%edi),%eax\n"
1122 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1123 - "movzbl (%esi),%eax\n"
1124 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1125 - "movzbl (%edx),%eax\n"
1126 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1127 - "paddsw %mm0,%mm1\n"
1128 - "psraw $0x6,%mm1\n"
1129 - "packuswb %mm1,%mm1\n"
1130 - "movd %mm1,0x0(%ebp)\n"
1131 -"convertdone:"
1132 - "popa\n"
1133 - "ret\n"
1137 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1138 - const uint8* u_buf,
1139 - const uint8* v_buf,
1140 - uint8* rgb_buf,
1141 - int width,
1142 - int source_dx);
1143 - asm(
1144 - ".text\n"
1145 - ".global ScaleYUVToRGB32Row\n"
1146 -"ScaleYUVToRGB32Row:\n"
1147 - "pusha\n"
1148 - "mov 0x24(%esp),%edx\n"
1149 - "mov 0x28(%esp),%edi\n"
1150 - "mov 0x2c(%esp),%esi\n"
1151 - "mov 0x30(%esp),%ebp\n"
1152 - "mov 0x34(%esp),%ecx\n"
1153 - "xor %ebx,%ebx\n"
1154 - "jmp scaleend\n"
1156 -"scaleloop:"
1157 - "mov %ebx,%eax\n"
1158 - "sar $0x11,%eax\n"
1159 - "movzbl (%edi,%eax,1),%eax\n"
1160 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1161 - "mov %ebx,%eax\n"
1162 - "sar $0x11,%eax\n"
1163 - "movzbl (%esi,%eax,1),%eax\n"
1164 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1165 - "mov %ebx,%eax\n"
1166 - "add 0x38(%esp),%ebx\n"
1167 - "sar $0x10,%eax\n"
1168 - "movzbl (%edx,%eax,1),%eax\n"
1169 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1170 - "mov %ebx,%eax\n"
1171 - "add 0x38(%esp),%ebx\n"
1172 - "sar $0x10,%eax\n"
1173 - "movzbl (%edx,%eax,1),%eax\n"
1174 - "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
1175 - "paddsw %mm0,%mm1\n"
1176 - "paddsw %mm0,%mm2\n"
1177 - "psraw $0x6,%mm1\n"
1178 - "psraw $0x6,%mm2\n"
1179 - "packuswb %mm2,%mm1\n"
1180 - "movntq %mm1,0x0(%ebp)\n"
1181 - "add $0x8,%ebp\n"
1182 -"scaleend:"
1183 - "sub $0x2,%ecx\n"
1184 - "jns scaleloop\n"
1186 - "and $0x1,%ecx\n"
1187 - "je scaledone\n"
1189 - "mov %ebx,%eax\n"
1190 - "sar $0x11,%eax\n"
1191 - "movzbl (%edi,%eax,1),%eax\n"
1192 - "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
1193 - "mov %ebx,%eax\n"
1194 - "sar $0x11,%eax\n"
1195 - "movzbl (%esi,%eax,1),%eax\n"
1196 - "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
1197 - "mov %ebx,%eax\n"
1198 - "sar $0x10,%eax\n"
1199 - "movzbl (%edx,%eax,1),%eax\n"
1200 - "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
1201 - "paddsw %mm0,%mm1\n"
1202 - "psraw $0x6,%mm1\n"
1203 - "packuswb %mm1,%mm1\n"
1204 - "movd %mm1,0x0(%ebp)\n"
1206 -"scaledone:"
1207 - "popa\n"
1208 - "ret\n"
1211 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1212 - const uint8* u_buf,
1213 - const uint8* v_buf,
1214 - uint8* rgb_buf,
1215 - int width,
1216 - int source_dx);
1217 - asm(
1218 - ".text\n"
1219 - ".global LinearScaleYUVToRGB32Row\n"
1220 -"LinearScaleYUVToRGB32Row:\n"
1221 - "pusha\n"
1222 - "mov 0x24(%esp),%edx\n"
1223 - "mov 0x28(%esp),%edi\n"
1224 - "mov 0x30(%esp),%ebp\n"
1226 - // source_width = width * source_dx + ebx
1227 - "mov 0x34(%esp), %ecx\n"
1228 - "imull 0x38(%esp), %ecx\n"
1229 - "mov %ecx, 0x34(%esp)\n"
1231 - "mov 0x38(%esp), %ecx\n"
1232 - "xor %ebx,%ebx\n" // x = 0
1233 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
1234 - "jl .lscaleend\n"
1235 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
1236 - "jmp .lscaleend\n"
1238 -".lscaleloop:"
1239 - "mov %ebx,%eax\n"
1240 - "sar $0x11,%eax\n"
1242 - "movzbl (%edi,%eax,1),%ecx\n"
1243 - "movzbl 1(%edi,%eax,1),%esi\n"
1244 - "mov %ebx,%eax\n"
1245 - "andl $0x1fffe, %eax \n"
1246 - "imul %eax, %esi \n"
1247 - "xorl $0x1fffe, %eax \n"
1248 - "imul %eax, %ecx \n"
1249 - "addl %esi, %ecx \n"
1250 - "shrl $17, %ecx \n"
1251 - "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
1253 - "mov 0x2c(%esp),%esi\n"
1254 - "mov %ebx,%eax\n"
1255 - "sar $0x11,%eax\n"
1257 - "movzbl (%esi,%eax,1),%ecx\n"
1258 - "movzbl 1(%esi,%eax,1),%esi\n"
1259 - "mov %ebx,%eax\n"
1260 - "andl $0x1fffe, %eax \n"
1261 - "imul %eax, %esi \n"
1262 - "xorl $0x1fffe, %eax \n"
1263 - "imul %eax, %ecx \n"
1264 - "addl %esi, %ecx \n"
1265 - "shrl $17, %ecx \n"
1266 - "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
1268 - "mov %ebx,%eax\n"
1269 - "sar $0x10,%eax\n"
1270 - "movzbl (%edx,%eax,1),%ecx\n"
1271 - "movzbl 1(%edx,%eax,1),%esi\n"
1272 - "mov %ebx,%eax\n"
1273 - "add 0x38(%esp),%ebx\n"
1274 - "andl $0xffff, %eax \n"
1275 - "imul %eax, %esi \n"
1276 - "xorl $0xffff, %eax \n"
1277 - "imul %eax, %ecx \n"
1278 - "addl %esi, %ecx \n"
1279 - "shrl $16, %ecx \n"
1280 - "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
1282 - "cmp 0x34(%esp), %ebx\n"
1283 - "jge .lscalelastpixel\n"
1285 - "mov %ebx,%eax\n"
1286 - "sar $0x10,%eax\n"
1287 - "movzbl (%edx,%eax,1),%ecx\n"
1288 - "movzbl 1(%edx,%eax,1),%esi\n"
1289 - "mov %ebx,%eax\n"
1290 - "add 0x38(%esp),%ebx\n"
1291 - "andl $0xffff, %eax \n"
1292 - "imul %eax, %esi \n"
1293 - "xorl $0xffff, %eax \n"
1294 - "imul %eax, %ecx \n"
1295 - "addl %esi, %ecx \n"
1296 - "shrl $16, %ecx \n"
1297 - "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
1299 - "paddsw %mm0,%mm1\n"
1300 - "paddsw %mm0,%mm2\n"
1301 - "psraw $0x6,%mm1\n"
1302 - "psraw $0x6,%mm2\n"
1303 - "packuswb %mm2,%mm1\n"
1304 - "movntq %mm1,0x0(%ebp)\n"
1305 - "add $0x8,%ebp\n"
1307 -".lscaleend:"
1308 - "cmp 0x34(%esp), %ebx\n"
1309 - "jl .lscaleloop\n"
1310 - "popa\n"
1311 - "ret\n"
1313 -".lscalelastpixel:"
1314 - "paddsw %mm0, %mm1\n"
1315 - "psraw $6, %mm1\n"
1316 - "packuswb %mm1, %mm1\n"
1317 - "movd %mm1, (%ebp)\n"
1318 - "popa\n"
1319 - "ret\n"
1322 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
1324 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
1325 - const uint8* u_buf,
1326 - const uint8* v_buf,
1327 - uint8* rgb_buf,
1328 - int width,
1329 - int16 *kCoefficientsRgbY);
1330 - asm(
1331 - ".text\n"
1332 -#if defined(OS_MACOSX)
1333 -"_PICConvertYUVToRGB32Row:\n"
1334 -#else
1335 -"PICConvertYUVToRGB32Row:\n"
1336 -#endif
1337 - "pusha\n"
1338 - "mov 0x24(%esp),%edx\n"
1339 - "mov 0x28(%esp),%edi\n"
1340 - "mov 0x2c(%esp),%esi\n"
1341 - "mov 0x30(%esp),%ebp\n"
1342 - "mov 0x38(%esp),%ecx\n"
1344 - "jmp .Lconvertend\n"
1346 -".Lconvertloop:"
1347 - "movzbl (%edi),%eax\n"
1348 - "add $0x1,%edi\n"
1349 - "movzbl (%esi),%ebx\n"
1350 - "add $0x1,%esi\n"
1351 - "movq 2048(%ecx,%eax,8),%mm0\n"
1352 - "movzbl (%edx),%eax\n"
1353 - "paddsw 4096(%ecx,%ebx,8),%mm0\n"
1354 - "movzbl 0x1(%edx),%ebx\n"
1355 - "movq 0(%ecx,%eax,8),%mm1\n"
1356 - "add $0x2,%edx\n"
1357 - "movq 0(%ecx,%ebx,8),%mm2\n"
1358 - "paddsw %mm0,%mm1\n"
1359 - "paddsw %mm0,%mm2\n"
1360 - "psraw $0x6,%mm1\n"
1361 - "psraw $0x6,%mm2\n"
1362 - "packuswb %mm2,%mm1\n"
1363 - "movntq %mm1,0x0(%ebp)\n"
1364 - "add $0x8,%ebp\n"
1365 -".Lconvertend:"
1366 - "subl $0x2,0x34(%esp)\n"
1367 - "jns .Lconvertloop\n"
1369 - "andl $0x1,0x34(%esp)\n"
1370 - "je .Lconvertdone\n"
1372 - "movzbl (%edi),%eax\n"
1373 - "movq 2048(%ecx,%eax,8),%mm0\n"
1374 - "movzbl (%esi),%eax\n"
1375 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1376 - "movzbl (%edx),%eax\n"
1377 - "movq 0(%ecx,%eax,8),%mm1\n"
1378 - "paddsw %mm0,%mm1\n"
1379 - "psraw $0x6,%mm1\n"
1380 - "packuswb %mm1,%mm1\n"
1381 - "movd %mm1,0x0(%ebp)\n"
1382 -".Lconvertdone:\n"
1383 - "popa\n"
1384 - "ret\n"
1387 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1388 - const uint8* u_buf,
1389 - const uint8* v_buf,
1390 - uint8* rgb_buf,
1391 - int width) {
1392 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
1393 - &kCoefficientsRgbY[0][0]);
1396 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
1397 - const uint8* u_buf,
1398 - const uint8* v_buf,
1399 - uint8* rgb_buf,
1400 - int width,
1401 - int source_dx,
1402 - int16 *kCoefficientsRgbY);
1404 - asm(
1405 - ".text\n"
1406 -#if defined(OS_MACOSX)
1407 -"_PICScaleYUVToRGB32Row:\n"
1408 -#else
1409 -"PICScaleYUVToRGB32Row:\n"
1410 -#endif
1411 - "pusha\n"
1412 - "mov 0x24(%esp),%edx\n"
1413 - "mov 0x28(%esp),%edi\n"
1414 - "mov 0x2c(%esp),%esi\n"
1415 - "mov 0x30(%esp),%ebp\n"
1416 - "mov 0x3c(%esp),%ecx\n"
1417 - "xor %ebx,%ebx\n"
1418 - "jmp Lscaleend\n"
1420 -"Lscaleloop:"
1421 - "mov %ebx,%eax\n"
1422 - "sar $0x11,%eax\n"
1423 - "movzbl (%edi,%eax,1),%eax\n"
1424 - "movq 2048(%ecx,%eax,8),%mm0\n"
1425 - "mov %ebx,%eax\n"
1426 - "sar $0x11,%eax\n"
1427 - "movzbl (%esi,%eax,1),%eax\n"
1428 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1429 - "mov %ebx,%eax\n"
1430 - "add 0x38(%esp),%ebx\n"
1431 - "sar $0x10,%eax\n"
1432 - "movzbl (%edx,%eax,1),%eax\n"
1433 - "movq 0(%ecx,%eax,8),%mm1\n"
1434 - "mov %ebx,%eax\n"
1435 - "add 0x38(%esp),%ebx\n"
1436 - "sar $0x10,%eax\n"
1437 - "movzbl (%edx,%eax,1),%eax\n"
1438 - "movq 0(%ecx,%eax,8),%mm2\n"
1439 - "paddsw %mm0,%mm1\n"
1440 - "paddsw %mm0,%mm2\n"
1441 - "psraw $0x6,%mm1\n"
1442 - "psraw $0x6,%mm2\n"
1443 - "packuswb %mm2,%mm1\n"
1444 - "movntq %mm1,0x0(%ebp)\n"
1445 - "add $0x8,%ebp\n"
1446 -"Lscaleend:"
1447 - "subl $0x2,0x34(%esp)\n"
1448 - "jns Lscaleloop\n"
1450 - "andl $0x1,0x34(%esp)\n"
1451 - "je Lscaledone\n"
1453 - "mov %ebx,%eax\n"
1454 - "sar $0x11,%eax\n"
1455 - "movzbl (%edi,%eax,1),%eax\n"
1456 - "movq 2048(%ecx,%eax,8),%mm0\n"
1457 - "mov %ebx,%eax\n"
1458 - "sar $0x11,%eax\n"
1459 - "movzbl (%esi,%eax,1),%eax\n"
1460 - "paddsw 4096(%ecx,%eax,8),%mm0\n"
1461 - "mov %ebx,%eax\n"
1462 - "sar $0x10,%eax\n"
1463 - "movzbl (%edx,%eax,1),%eax\n"
1464 - "movq 0(%ecx,%eax,8),%mm1\n"
1465 - "paddsw %mm0,%mm1\n"
1466 - "psraw $0x6,%mm1\n"
1467 - "packuswb %mm1,%mm1\n"
1468 - "movd %mm1,0x0(%ebp)\n"
1470 -"Lscaledone:"
1471 - "popa\n"
1472 - "ret\n"
1476 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1477 - const uint8* u_buf,
1478 - const uint8* v_buf,
1479 - uint8* rgb_buf,
1480 - int width,
1481 - int source_dx) {
1482 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1483 - &kCoefficientsRgbY[0][0]);
1486 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
1487 - const uint8* u_buf,
1488 - const uint8* v_buf,
1489 - uint8* rgb_buf,
1490 - int width,
1491 - int source_dx,
1492 - int16 *kCoefficientsRgbY);
1493 - asm(
1494 - ".text\n"
1495 -#if defined(OS_MACOSX)
1496 -"_PICLinearScaleYUVToRGB32Row:\n"
1497 -#else
1498 -"PICLinearScaleYUVToRGB32Row:\n"
1499 -#endif
1500 - "pusha\n"
1501 - "mov 0x24(%esp),%edx\n"
1502 - "mov 0x30(%esp),%ebp\n"
1503 - "mov 0x34(%esp),%ecx\n"
1504 - "mov 0x3c(%esp),%edi\n"
1505 - "xor %ebx,%ebx\n"
1507 - // source_width = width * source_dx + ebx
1508 - "mov 0x34(%esp), %ecx\n"
1509 - "imull 0x38(%esp), %ecx\n"
1510 - "mov %ecx, 0x34(%esp)\n"
1512 - "mov 0x38(%esp), %ecx\n"
1513 - "xor %ebx,%ebx\n" // x = 0
1514 - "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
1515 - "jl .lscaleend\n"
1516 - "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
1517 - "jmp .lscaleend\n"
1519 -".lscaleloop:"
1520 - "mov 0x28(%esp),%esi\n"
1521 - "mov %ebx,%eax\n"
1522 - "sar $0x11,%eax\n"
1524 - "movzbl (%esi,%eax,1),%ecx\n"
1525 - "movzbl 1(%esi,%eax,1),%esi\n"
1526 - "mov %ebx,%eax\n"
1527 - "andl $0x1fffe, %eax \n"
1528 - "imul %eax, %esi \n"
1529 - "xorl $0x1fffe, %eax \n"
1530 - "imul %eax, %ecx \n"
1531 - "addl %esi, %ecx \n"
1532 - "shrl $17, %ecx \n"
1533 - "movq 2048(%edi,%ecx,8),%mm0\n"
1535 - "mov 0x2c(%esp),%esi\n"
1536 - "mov %ebx,%eax\n"
1537 - "sar $0x11,%eax\n"
1539 - "movzbl (%esi,%eax,1),%ecx\n"
1540 - "movzbl 1(%esi,%eax,1),%esi\n"
1541 - "mov %ebx,%eax\n"
1542 - "andl $0x1fffe, %eax \n"
1543 - "imul %eax, %esi \n"
1544 - "xorl $0x1fffe, %eax \n"
1545 - "imul %eax, %ecx \n"
1546 - "addl %esi, %ecx \n"
1547 - "shrl $17, %ecx \n"
1548 - "paddsw 4096(%edi,%ecx,8),%mm0\n"
1550 - "mov %ebx,%eax\n"
1551 - "sar $0x10,%eax\n"
1552 - "movzbl (%edx,%eax,1),%ecx\n"
1553 - "movzbl 1(%edx,%eax,1),%esi\n"
1554 - "mov %ebx,%eax\n"
1555 - "add 0x38(%esp),%ebx\n"
1556 - "andl $0xffff, %eax \n"
1557 - "imul %eax, %esi \n"
1558 - "xorl $0xffff, %eax \n"
1559 - "imul %eax, %ecx \n"
1560 - "addl %esi, %ecx \n"
1561 - "shrl $16, %ecx \n"
1562 - "movq (%edi,%ecx,8),%mm1\n"
1564 - "cmp 0x34(%esp), %ebx\n"
1565 - "jge .lscalelastpixel\n"
1567 - "mov %ebx,%eax\n"
1568 - "sar $0x10,%eax\n"
1569 - "movzbl (%edx,%eax,1),%ecx\n"
1570 - "movzbl 1(%edx,%eax,1),%esi\n"
1571 - "mov %ebx,%eax\n"
1572 - "add 0x38(%esp),%ebx\n"
1573 - "andl $0xffff, %eax \n"
1574 - "imul %eax, %esi \n"
1575 - "xorl $0xffff, %eax \n"
1576 - "imul %eax, %ecx \n"
1577 - "addl %esi, %ecx \n"
1578 - "shrl $16, %ecx \n"
1579 - "movq (%edi,%ecx,8),%mm2\n"
1581 - "paddsw %mm0,%mm1\n"
1582 - "paddsw %mm0,%mm2\n"
1583 - "psraw $0x6,%mm1\n"
1584 - "psraw $0x6,%mm2\n"
1585 - "packuswb %mm2,%mm1\n"
1586 - "movntq %mm1,0x0(%ebp)\n"
1587 - "add $0x8,%ebp\n"
1589 -".lscaleend:"
1590 - "cmp %ebx, 0x34(%esp)\n"
1591 - "jg .lscaleloop\n"
1592 - "popa\n"
1593 - "ret\n"
1595 -".lscalelastpixel:"
1596 - "paddsw %mm0, %mm1\n"
1597 - "psraw $6, %mm1\n"
1598 - "packuswb %mm1, %mm1\n"
1599 - "movd %mm1, (%ebp)\n"
1600 - "popa\n"
1601 - "ret\n"
1604 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1605 - const uint8* u_buf,
1606 - const uint8* v_buf,
1607 - uint8* rgb_buf,
1608 - int width,
1609 - int source_dx) {
1610 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
1611 - &kCoefficientsRgbY[0][0]);
1614 -#else // USE_MMX
1616 // C reference code that mimic the YUV assembly.
1617 #define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
1618 #define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
1619 (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
1621 static inline void YuvPixel(uint8 y,
1622 uint8 u,
1623 uint8 v,
1624 @@ -833,66 +39,71 @@ static inline void YuvPixel(uint8 y,
1625 a >>= 6;
1627 *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
1628 (packuswb(g) << 8) |
1629 (packuswb(r) << 16) |
1630 (packuswb(a) << 24);
1633 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1634 - const uint8* u_buf,
1635 - const uint8* v_buf,
1636 - uint8* rgb_buf,
1637 - int width) {
1638 +void FastConvertYUVToRGB32Row_C(const uint8* y_buf,
1639 + const uint8* u_buf,
1640 + const uint8* v_buf,
1641 + uint8* rgb_buf,
1642 + int width,
1643 + unsigned int x_shift) {
1644 for (int x = 0; x < width; x += 2) {
1645 - uint8 u = u_buf[x >> 1];
1646 - uint8 v = v_buf[x >> 1];
1647 + uint8 u = u_buf[x >> x_shift];
1648 + uint8 v = v_buf[x >> x_shift];
1649 uint8 y0 = y_buf[x];
1650 YuvPixel(y0, u, v, rgb_buf);
1651 if ((x + 1) < width) {
1652 uint8 y1 = y_buf[x + 1];
1653 + if (x_shift == 0) {
1654 + u = u_buf[x + 1];
1655 + v = v_buf[x + 1];
1657 YuvPixel(y1, u, v, rgb_buf + 4);
1659 rgb_buf += 8; // Advance 2 pixels.
1663 // 16.16 fixed point is used. A shift by 16 isolates the integer.
1664 // A shift by 17 is used to further subsample the chrominence channels.
1665 // & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
1666 // for 1/65536 pixel accurate interpolation.
1667 -void ScaleYUVToRGB32Row(const uint8* y_buf,
1668 - const uint8* u_buf,
1669 - const uint8* v_buf,
1670 - uint8* rgb_buf,
1671 - int width,
1672 - int source_dx) {
1673 +void ScaleYUVToRGB32Row_C(const uint8* y_buf,
1674 + const uint8* u_buf,
1675 + const uint8* v_buf,
1676 + uint8* rgb_buf,
1677 + int width,
1678 + int source_dx) {
1679 int x = 0;
1680 for (int i = 0; i < width; i += 2) {
1681 int y = y_buf[x >> 16];
1682 int u = u_buf[(x >> 17)];
1683 int v = v_buf[(x >> 17)];
1684 YuvPixel(y, u, v, rgb_buf);
1685 x += source_dx;
1686 if ((i + 1) < width) {
1687 y = y_buf[x >> 16];
1688 YuvPixel(y, u, v, rgb_buf+4);
1689 x += source_dx;
1691 rgb_buf += 8;
1695 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
1696 - const uint8* u_buf,
1697 - const uint8* v_buf,
1698 - uint8* rgb_buf,
1699 - int width,
1700 - int source_dx) {
1701 +void LinearScaleYUVToRGB32Row_C(const uint8* y_buf,
1702 + const uint8* u_buf,
1703 + const uint8* v_buf,
1704 + uint8* rgb_buf,
1705 + int width,
1706 + int source_dx) {
1707 int x = 0;
1708 if (source_dx >= 0x20000) {
1709 x = 32768;
1711 for (int i = 0; i < width; i += 2) {
1712 int y0 = y_buf[x >> 16];
1713 int y1 = y_buf[(x >> 16) + 1];
1714 int u0 = u_buf[(x >> 17)];
1715 @@ -913,11 +124,10 @@ void LinearScaleYUVToRGB32Row(const uint
1716 y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
1717 YuvPixel(y, u, v, rgb_buf+4);
1718 x += source_dx;
1720 rgb_buf += 8;
1724 -#endif // USE_MMX
1725 } // extern "C"
1727 diff --git a/gfx/ycbcr/yuv_row_posix.cpp b/gfx/ycbcr/yuv_row_posix.cpp
1728 --- a/gfx/ycbcr/yuv_row_posix.cpp
1729 +++ b/gfx/ycbcr/yuv_row_posix.cpp
1730 @@ -1,33 +1,32 @@
1731 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
1732 // Use of this source code is governed by a BSD-style license that can be
1733 // found in the LICENSE file.
1735 -#include "media/base/yuv_row.h"
1737 -#ifdef _DEBUG
1738 -#include "base/logging.h"
1739 -#else
1740 +#include "yuv_row.h"
1741 +#include "mozilla/SSE.h"
1743 #define DCHECK(a)
1744 -#endif
1746 extern "C" {
1748 -#if USE_SSE2 && defined(ARCH_CPU_X86_64)
1749 +#if defined(ARCH_CPU_X86_64)
1751 +// We don't need CPUID guards here, since x86-64 implies SSE2.
1753 // AMD64 ABI uses register paremters.
1754 void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
1755 const uint8* u_buf, // rsi
1756 const uint8* v_buf, // rdx
1757 uint8* rgb_buf, // rcx
1758 int width) { // r8
1759 asm(
1760 - "jmp convertend\n"
1761 -"convertloop:"
1762 + "jmp 1f\n"
1763 +"0:"
1764 "movzb (%1),%%r10\n"
1765 "add $0x1,%1\n"
1766 "movzb (%2),%%r11\n"
1767 "add $0x1,%2\n"
1768 "movq 2048(%5,%%r10,8),%%xmm0\n"
1769 "movzb (%0),%%r10\n"
1770 "movq 4096(%5,%%r11,8),%%xmm1\n"
1771 "movzb 0x1(%0),%%r11\n"
1772 @@ -37,36 +36,36 @@ void FastConvertYUVToRGB32Row(const uint
1773 "movq (%5,%%r11,8),%%xmm3\n"
1774 "paddsw %%xmm0,%%xmm2\n"
1775 "paddsw %%xmm0,%%xmm3\n"
1776 "shufps $0x44,%%xmm3,%%xmm2\n"
1777 "psraw $0x6,%%xmm2\n"
1778 "packuswb %%xmm2,%%xmm2\n"
1779 "movq %%xmm2,0x0(%3)\n"
1780 "add $0x8,%3\n"
1781 -"convertend:"
1782 +"1:"
1783 "sub $0x2,%4\n"
1784 - "jns convertloop\n"
1786 -"convertnext:"
1787 + "jns 0b\n"
1789 +"2:"
1790 "add $0x1,%4\n"
1791 - "js convertdone\n"
1792 + "js 3f\n"
1794 "movzb (%1),%%r10\n"
1795 "movq 2048(%5,%%r10,8),%%xmm0\n"
1796 "movzb (%2),%%r10\n"
1797 "movq 4096(%5,%%r10,8),%%xmm1\n"
1798 "paddsw %%xmm1,%%xmm0\n"
1799 "movzb (%0),%%r10\n"
1800 "movq (%5,%%r10,8),%%xmm1\n"
1801 "paddsw %%xmm0,%%xmm1\n"
1802 "psraw $0x6,%%xmm1\n"
1803 "packuswb %%xmm1,%%xmm1\n"
1804 "movd %%xmm1,0x0(%3)\n"
1805 -"convertdone:"
1806 +"3:"
1808 : "r"(y_buf), // %0
1809 "r"(u_buf), // %1
1810 "r"(v_buf), // %2
1811 "r"(rgb_buf), // %3
1812 "r"(width), // %4
1813 "r" (kCoefficientsRgbY) // %5
1814 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
1815 @@ -77,19 +76,19 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1816 const uint8* u_buf, // rsi
1817 const uint8* v_buf, // rdx
1818 uint8* rgb_buf, // rcx
1819 int width, // r8
1820 int source_dx) { // r9
1821 asm(
1822 "xor %%r11,%%r11\n"
1823 "sub $0x2,%4\n"
1824 - "js scalenext\n"
1826 -"scaleloop:"
1827 + "js 1f\n"
1829 +"0:"
1830 "mov %%r11,%%r10\n"
1831 "sar $0x11,%%r10\n"
1832 "movzb (%1,%%r10,1),%%rax\n"
1833 "movq 2048(%5,%%rax,8),%%xmm0\n"
1834 "movzb (%2,%%r10,1),%%rax\n"
1835 "movq 4096(%5,%%rax,8),%%xmm1\n"
1836 "lea (%%r11,%6),%%r10\n"
1837 "sar $0x10,%%r11\n"
1838 @@ -103,38 +102,38 @@ void ScaleYUVToRGB32Row(const uint8* y_b
1839 "paddsw %%xmm0,%%xmm1\n"
1840 "paddsw %%xmm0,%%xmm2\n"
1841 "shufps $0x44,%%xmm2,%%xmm1\n"
1842 "psraw $0x6,%%xmm1\n"
1843 "packuswb %%xmm1,%%xmm1\n"
1844 "movq %%xmm1,0x0(%3)\n"
1845 "add $0x8,%3\n"
1846 "sub $0x2,%4\n"
1847 - "jns scaleloop\n"
1849 -"scalenext:"
1850 + "jns 0b\n"
1852 +"1:"
1853 "add $0x1,%4\n"
1854 - "js scaledone\n"
1855 + "js 2f\n"
1857 "mov %%r11,%%r10\n"
1858 "sar $0x11,%%r10\n"
1859 "movzb (%1,%%r10,1),%%rax\n"
1860 "movq 2048(%5,%%rax,8),%%xmm0\n"
1861 "movzb (%2,%%r10,1),%%rax\n"
1862 "movq 4096(%5,%%rax,8),%%xmm1\n"
1863 "paddsw %%xmm1,%%xmm0\n"
1864 "sar $0x10,%%r11\n"
1865 "movzb (%0,%%r11,1),%%rax\n"
1866 "movq (%5,%%rax,8),%%xmm1\n"
1867 "paddsw %%xmm0,%%xmm1\n"
1868 "psraw $0x6,%%xmm1\n"
1869 "packuswb %%xmm1,%%xmm1\n"
1870 "movd %%xmm1,0x0(%3)\n"
1872 -"scaledone:"
1873 +"2:"
1875 : "r"(y_buf), // %0
1876 "r"(u_buf), // %1
1877 "r"(v_buf), // %2
1878 "r"(rgb_buf), // %3
1879 "r"(width), // %4
1880 "r" (kCoefficientsRgbY), // %5
1881 "r"(static_cast<long>(source_dx)) // %6
1882 @@ -146,23 +145,23 @@ void LinearScaleYUVToRGB32Row(const uint
1883 const uint8* u_buf,
1884 const uint8* v_buf,
1885 uint8* rgb_buf,
1886 int width,
1887 int source_dx) {
1888 asm(
1889 "xor %%r11,%%r11\n" // x = 0
1890 "sub $0x2,%4\n"
1891 - "js .lscalenext\n"
1892 + "js 2f\n"
1893 "cmp $0x20000,%6\n" // if source_dx >= 2.0
1894 - "jl .lscalehalf\n"
1895 + "jl 0f\n"
1896 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
1897 -".lscalehalf:"
1899 -".lscaleloop:"
1900 +"0:"
1902 +"1:"
1903 "mov %%r11,%%r10\n"
1904 "sar $0x11,%%r10\n"
1906 "movzb (%1, %%r10, 1), %%r13 \n"
1907 "movzb 1(%1, %%r10, 1), %%r14 \n"
1908 "mov %%r11, %%rax \n"
1909 "and $0x1fffe, %%rax \n"
1910 "imul %%rax, %%r14 \n"
1911 @@ -215,21 +214,21 @@ void LinearScaleYUVToRGB32Row(const uint
1912 "paddsw %%xmm0,%%xmm1\n"
1913 "paddsw %%xmm0,%%xmm2\n"
1914 "shufps $0x44,%%xmm2,%%xmm1\n"
1915 "psraw $0x6,%%xmm1\n"
1916 "packuswb %%xmm1,%%xmm1\n"
1917 "movq %%xmm1,0x0(%3)\n"
1918 "add $0x8,%3\n"
1919 "sub $0x2,%4\n"
1920 - "jns .lscaleloop\n"
1922 -".lscalenext:"
1923 + "jns 1b\n"
1925 +"2:"
1926 "add $0x1,%4\n"
1927 - "js .lscaledone\n"
1928 + "js 3f\n"
1930 "mov %%r11,%%r10\n"
1931 "sar $0x11,%%r10\n"
1933 "movzb (%1,%%r10,1), %%r13 \n"
1934 "movq 2048(%5,%%r13,8),%%xmm0\n"
1936 "movzb (%2,%%r10,1), %%r13 \n"
1937 @@ -241,52 +240,52 @@ void LinearScaleYUVToRGB32Row(const uint
1938 "movzb (%0,%%r11,1), %%r13 \n"
1939 "movq (%5,%%r13,8),%%xmm1\n"
1941 "paddsw %%xmm0,%%xmm1\n"
1942 "psraw $0x6,%%xmm1\n"
1943 "packuswb %%xmm1,%%xmm1\n"
1944 "movd %%xmm1,0x0(%3)\n"
1946 -".lscaledone:"
1947 +"3:"
1949 : "r"(y_buf), // %0
1950 "r"(u_buf), // %1
1951 "r"(v_buf), // %2
1952 "r"(rgb_buf), // %3
1953 "r"(width), // %4
1954 "r" (kCoefficientsRgbY), // %5
1955 "r"(static_cast<long>(source_dx)) // %6
1956 : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
1960 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && !defined(__PIC__)
1961 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
1963 // PIC version is slower because less registers are available, so
1964 // non-PIC is used on platforms where it is possible.
1966 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
1967 - const uint8* u_buf,
1968 - const uint8* v_buf,
1969 - uint8* rgb_buf,
1970 - int width);
1971 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
1972 + const uint8* u_buf,
1973 + const uint8* v_buf,
1974 + uint8* rgb_buf,
1975 + int width);
1976 asm(
1977 ".text\n"
1978 - ".global FastConvertYUVToRGB32Row\n"
1979 -"FastConvertYUVToRGB32Row:\n"
1980 + ".global FastConvertYUVToRGB32Row_SSE\n"
1981 + ".type FastConvertYUVToRGB32Row_SSE, @function\n"
1982 +"FastConvertYUVToRGB32Row_SSE:\n"
1983 "pusha\n"
1984 "mov 0x24(%esp),%edx\n"
1985 "mov 0x28(%esp),%edi\n"
1986 "mov 0x2c(%esp),%esi\n"
1987 "mov 0x30(%esp),%ebp\n"
1988 "mov 0x34(%esp),%ecx\n"
1989 - "jmp convertend\n"
1991 -"convertloop:"
1992 + "jmp 1f\n"
1994 +"0:"
1995 "movzbl (%edi),%eax\n"
1996 "add $0x1,%edi\n"
1997 "movzbl (%esi),%ebx\n"
1998 "add $0x1,%esi\n"
1999 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2000 "movzbl (%edx),%eax\n"
2001 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
2002 "movzbl 0x1(%edx),%ebx\n"
2003 @@ -295,59 +294,77 @@ void FastConvertYUVToRGB32Row(const uint
2004 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
2005 "paddsw %mm0,%mm1\n"
2006 "paddsw %mm0,%mm2\n"
2007 "psraw $0x6,%mm1\n"
2008 "psraw $0x6,%mm2\n"
2009 "packuswb %mm2,%mm1\n"
2010 "movntq %mm1,0x0(%ebp)\n"
2011 "add $0x8,%ebp\n"
2012 -"convertend:"
2013 +"1:"
2014 "sub $0x2,%ecx\n"
2015 - "jns convertloop\n"
2016 + "jns 0b\n"
2018 "and $0x1,%ecx\n"
2019 - "je convertdone\n"
2020 + "je 2f\n"
2022 "movzbl (%edi),%eax\n"
2023 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2024 "movzbl (%esi),%eax\n"
2025 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2026 "movzbl (%edx),%eax\n"
2027 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
2028 "paddsw %mm0,%mm1\n"
2029 "psraw $0x6,%mm1\n"
2030 "packuswb %mm1,%mm1\n"
2031 "movd %mm1,0x0(%ebp)\n"
2032 -"convertdone:"
2033 +"2:"
2034 "popa\n"
2035 "ret\n"
2036 +#if !defined(XP_MACOSX)
2037 + ".previous\n"
2038 +#endif
2042 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2043 - const uint8* u_buf,
2044 - const uint8* v_buf,
2045 - uint8* rgb_buf,
2046 - int width,
2047 - int source_dx);
2048 +void FastConvertYUVToRGB32Row(const uint8* y_buf,
2049 + const uint8* u_buf,
2050 + const uint8* v_buf,
2051 + uint8* rgb_buf,
2052 + int width)
2054 + if (mozilla::supports_sse()) {
2055 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
2056 + return;
2059 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2063 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2064 + const uint8* u_buf,
2065 + const uint8* v_buf,
2066 + uint8* rgb_buf,
2067 + int width,
2068 + int source_dx);
2069 asm(
2070 ".text\n"
2071 - ".global ScaleYUVToRGB32Row\n"
2072 -"ScaleYUVToRGB32Row:\n"
2073 + ".global ScaleYUVToRGB32Row_SSE\n"
2074 + ".type ScaleYUVToRGB32Row_SSE, @function\n"
2075 +"ScaleYUVToRGB32Row_SSE:\n"
2076 "pusha\n"
2077 "mov 0x24(%esp),%edx\n"
2078 "mov 0x28(%esp),%edi\n"
2079 "mov 0x2c(%esp),%esi\n"
2080 "mov 0x30(%esp),%ebp\n"
2081 "mov 0x34(%esp),%ecx\n"
2082 "xor %ebx,%ebx\n"
2083 - "jmp scaleend\n"
2085 -"scaleloop:"
2086 + "jmp 1f\n"
2088 +"0:"
2089 "mov %ebx,%eax\n"
2090 "sar $0x11,%eax\n"
2091 "movzbl (%edi,%eax,1),%eax\n"
2092 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2093 "mov %ebx,%eax\n"
2094 "sar $0x11,%eax\n"
2095 "movzbl (%esi,%eax,1),%eax\n"
2096 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
2097 @@ -363,22 +380,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2098 "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
2099 "paddsw %mm0,%mm1\n"
2100 "paddsw %mm0,%mm2\n"
2101 "psraw $0x6,%mm1\n"
2102 "psraw $0x6,%mm2\n"
2103 "packuswb %mm2,%mm1\n"
2104 "movntq %mm1,0x0(%ebp)\n"
2105 "add $0x8,%ebp\n"
2106 -"scaleend:"
2107 +"1:"
2108 "sub $0x2,%ecx\n"
2109 - "jns scaleloop\n"
2110 + "jns 0b\n"
2112 "and $0x1,%ecx\n"
2113 - "je scaledone\n"
2114 + "je 2f\n"
2116 "mov %ebx,%eax\n"
2117 "sar $0x11,%eax\n"
2118 "movzbl (%edi,%eax,1),%eax\n"
2119 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
2120 "mov %ebx,%eax\n"
2121 "sar $0x11,%eax\n"
2122 "movzbl (%esi,%eax,1),%eax\n"
2123 @@ -387,51 +404,71 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2124 "sar $0x10,%eax\n"
2125 "movzbl (%edx,%eax,1),%eax\n"
2126 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
2127 "paddsw %mm0,%mm1\n"
2128 "psraw $0x6,%mm1\n"
2129 "packuswb %mm1,%mm1\n"
2130 "movd %mm1,0x0(%ebp)\n"
2132 -"scaledone:"
2133 +"2:"
2134 "popa\n"
2135 "ret\n"
2136 +#if !defined(XP_MACOSX)
2137 + ".previous\n"
2138 +#endif
2141 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2142 - const uint8* u_buf,
2143 - const uint8* v_buf,
2144 - uint8* rgb_buf,
2145 - int width,
2146 - int source_dx);
2147 +void ScaleYUVToRGB32Row(const uint8* y_buf,
2148 + const uint8* u_buf,
2149 + const uint8* v_buf,
2150 + uint8* rgb_buf,
2151 + int width,
2152 + int source_dx)
2154 + if (mozilla::supports_sse()) {
2155 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2156 + width, source_dx);
2159 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
2160 + width, source_dx);
2163 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2164 + const uint8* u_buf,
2165 + const uint8* v_buf,
2166 + uint8* rgb_buf,
2167 + int width,
2168 + int source_dx);
2169 asm(
2170 ".text\n"
2171 - ".global LinearScaleYUVToRGB32Row\n"
2172 -"LinearScaleYUVToRGB32Row:\n"
2173 + ".global LinearScaleYUVToRGB32Row_SSE\n"
2174 + ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
2175 +"LinearScaleYUVToRGB32Row_SSE:\n"
2176 "pusha\n"
2177 "mov 0x24(%esp),%edx\n"
2178 "mov 0x28(%esp),%edi\n"
2179 "mov 0x30(%esp),%ebp\n"
2181 // source_width = width * source_dx + ebx
2182 "mov 0x34(%esp), %ecx\n"
2183 "imull 0x38(%esp), %ecx\n"
2184 "mov %ecx, 0x34(%esp)\n"
2186 "mov 0x38(%esp), %ecx\n"
2187 "xor %ebx,%ebx\n" // x = 0
2188 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
2189 - "jl .lscaleend\n"
2190 + "jl 1f\n"
2191 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
2192 - "jmp .lscaleend\n"
2194 -".lscaleloop:"
2195 - "mov %ebx,%eax\n"
2196 - "sar $0x11,%eax\n"
2197 + "jmp 1f\n"
2199 +"0:"
2200 + "mov %ebx,%eax\n"
2201 + "sar $0x11,%eax\n"
2203 "movzbl (%edi,%eax,1),%ecx\n"
2204 "movzbl 1(%edi,%eax,1),%esi\n"
2205 "mov %ebx,%eax\n"
2206 "andl $0x1fffe, %eax \n"
2207 "imul %eax, %esi \n"
2208 "xorl $0x1fffe, %eax \n"
2209 "imul %eax, %ecx \n"
2210 @@ -464,17 +501,17 @@ void LinearScaleYUVToRGB32Row(const uint
2211 "imul %eax, %esi \n"
2212 "xorl $0xffff, %eax \n"
2213 "imul %eax, %ecx \n"
2214 "addl %esi, %ecx \n"
2215 "shrl $16, %ecx \n"
2216 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
2218 "cmp 0x34(%esp), %ebx\n"
2219 - "jge .lscalelastpixel\n"
2220 + "jge 2f\n"
2222 "mov %ebx,%eax\n"
2223 "sar $0x10,%eax\n"
2224 "movzbl (%edx,%eax,1),%ecx\n"
2225 "movzbl 1(%edx,%eax,1),%esi\n"
2226 "mov %ebx,%eax\n"
2227 "add 0x38(%esp),%ebx\n"
2228 "andl $0xffff, %eax \n"
2229 @@ -488,56 +525,76 @@ void LinearScaleYUVToRGB32Row(const uint
2230 "paddsw %mm0,%mm1\n"
2231 "paddsw %mm0,%mm2\n"
2232 "psraw $0x6,%mm1\n"
2233 "psraw $0x6,%mm2\n"
2234 "packuswb %mm2,%mm1\n"
2235 "movntq %mm1,0x0(%ebp)\n"
2236 "add $0x8,%ebp\n"
2238 -".lscaleend:"
2239 +"1:"
2240 "cmp 0x34(%esp), %ebx\n"
2241 - "jl .lscaleloop\n"
2242 + "jl 0b\n"
2243 "popa\n"
2244 "ret\n"
2246 -".lscalelastpixel:"
2247 +"2:"
2248 "paddsw %mm0, %mm1\n"
2249 "psraw $6, %mm1\n"
2250 "packuswb %mm1, %mm1\n"
2251 "movd %mm1, (%ebp)\n"
2252 "popa\n"
2253 "ret\n"
2254 +#if !defined(XP_MACOSX)
2255 + ".previous\n"
2256 +#endif
2259 -#elif USE_MMX && !defined(ARCH_CPU_X86_64) && defined(__PIC__)
2261 -extern void PICConvertYUVToRGB32Row(const uint8* y_buf,
2262 - const uint8* u_buf,
2263 - const uint8* v_buf,
2264 - uint8* rgb_buf,
2265 - int width,
2266 - int16 *kCoefficientsRgbY);
2267 +void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2268 + const uint8* u_buf,
2269 + const uint8* v_buf,
2270 + uint8* rgb_buf,
2271 + int width,
2272 + int source_dx)
2274 + if (mozilla::supports_sse()) {
2275 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
2276 + width, source_dx);
2279 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
2280 + width, source_dx);
2283 +#elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
2285 +void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2286 + const uint8* u_buf,
2287 + const uint8* v_buf,
2288 + uint8* rgb_buf,
2289 + int width,
2290 + int16 *kCoefficientsRgbY);
2292 asm(
2293 ".text\n"
2294 -#if defined(OS_MACOSX)
2295 -"_PICConvertYUVToRGB32Row:\n"
2296 +#if defined(XP_MACOSX)
2297 +"_PICConvertYUVToRGB32Row_SSE:\n"
2298 #else
2299 -"PICConvertYUVToRGB32Row:\n"
2300 +"PICConvertYUVToRGB32Row_SSE:\n"
2301 #endif
2302 "pusha\n"
2303 "mov 0x24(%esp),%edx\n"
2304 "mov 0x28(%esp),%edi\n"
2305 "mov 0x2c(%esp),%esi\n"
2306 "mov 0x30(%esp),%ebp\n"
2307 "mov 0x38(%esp),%ecx\n"
2309 - "jmp .Lconvertend\n"
2311 -".Lconvertloop:"
2312 + "jmp 1f\n"
2314 +"0:"
2315 "movzbl (%edi),%eax\n"
2316 "add $0x1,%edi\n"
2317 "movzbl (%esi),%ebx\n"
2318 "add $0x1,%esi\n"
2319 "movq 2048(%ecx,%eax,8),%mm0\n"
2320 "movzbl (%edx),%eax\n"
2321 "paddsw 4096(%ecx,%ebx,8),%mm0\n"
2322 "movzbl 0x1(%edx),%ebx\n"
2323 @@ -546,72 +603,81 @@ extern void PICConvertYUVToRGB32Row(cons
2324 "movq 0(%ecx,%ebx,8),%mm2\n"
2325 "paddsw %mm0,%mm1\n"
2326 "paddsw %mm0,%mm2\n"
2327 "psraw $0x6,%mm1\n"
2328 "psraw $0x6,%mm2\n"
2329 "packuswb %mm2,%mm1\n"
2330 "movntq %mm1,0x0(%ebp)\n"
2331 "add $0x8,%ebp\n"
2332 -".Lconvertend:"
2333 +"1:"
2334 "subl $0x2,0x34(%esp)\n"
2335 - "jns .Lconvertloop\n"
2336 + "jns 0b\n"
2338 "andl $0x1,0x34(%esp)\n"
2339 - "je .Lconvertdone\n"
2340 + "je 2f\n"
2342 "movzbl (%edi),%eax\n"
2343 "movq 2048(%ecx,%eax,8),%mm0\n"
2344 "movzbl (%esi),%eax\n"
2345 "paddsw 4096(%ecx,%eax,8),%mm0\n"
2346 "movzbl (%edx),%eax\n"
2347 "movq 0(%ecx,%eax,8),%mm1\n"
2348 "paddsw %mm0,%mm1\n"
2349 "psraw $0x6,%mm1\n"
2350 "packuswb %mm1,%mm1\n"
2351 "movd %mm1,0x0(%ebp)\n"
2352 -".Lconvertdone:\n"
2353 +"2:"
2354 "popa\n"
2355 "ret\n"
2356 +#if !defined(XP_MACOSX)
2357 + ".previous\n"
2358 +#endif
2361 void FastConvertYUVToRGB32Row(const uint8* y_buf,
2362 const uint8* u_buf,
2363 const uint8* v_buf,
2364 uint8* rgb_buf,
2365 - int width) {
2366 - PICConvertYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width,
2367 - &kCoefficientsRgbY[0][0]);
2370 -extern void PICScaleYUVToRGB32Row(const uint8* y_buf,
2371 + int width)
2373 + if (mozilla::supports_sse()) {
2374 + PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2375 + &kCoefficientsRgbY[0][0]);
2376 + return;
2379 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2382 +void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2383 const uint8* u_buf,
2384 const uint8* v_buf,
2385 uint8* rgb_buf,
2386 int width,
2387 int source_dx,
2388 int16 *kCoefficientsRgbY);
2390 asm(
2391 ".text\n"
2392 -#if defined(OS_MACOSX)
2393 -"_PICScaleYUVToRGB32Row:\n"
2394 +#if defined(XP_MACOSX)
2395 +"_PICScaleYUVToRGB32Row_SSE:\n"
2396 #else
2397 -"PICScaleYUVToRGB32Row:\n"
2398 +"PICScaleYUVToRGB32Row_SSE:\n"
2399 #endif
2400 "pusha\n"
2401 "mov 0x24(%esp),%edx\n"
2402 "mov 0x28(%esp),%edi\n"
2403 "mov 0x2c(%esp),%esi\n"
2404 "mov 0x30(%esp),%ebp\n"
2405 "mov 0x3c(%esp),%ecx\n"
2406 "xor %ebx,%ebx\n"
2407 - "jmp Lscaleend\n"
2409 -"Lscaleloop:"
2410 + "jmp 1f\n"
2412 +"0:"
2413 "mov %ebx,%eax\n"
2414 "sar $0x11,%eax\n"
2415 "movzbl (%edi,%eax,1),%eax\n"
2416 "movq 2048(%ecx,%eax,8),%mm0\n"
2417 "mov %ebx,%eax\n"
2418 "sar $0x11,%eax\n"
2419 "movzbl (%esi,%eax,1),%eax\n"
2420 "paddsw 4096(%ecx,%eax,8),%mm0\n"
2421 @@ -627,22 +693,22 @@ extern void PICScaleYUVToRGB32Row(const
2422 "movq 0(%ecx,%eax,8),%mm2\n"
2423 "paddsw %mm0,%mm1\n"
2424 "paddsw %mm0,%mm2\n"
2425 "psraw $0x6,%mm1\n"
2426 "psraw $0x6,%mm2\n"
2427 "packuswb %mm2,%mm1\n"
2428 "movntq %mm1,0x0(%ebp)\n"
2429 "add $0x8,%ebp\n"
2430 -"Lscaleend:"
2431 +"1:"
2432 "subl $0x2,0x34(%esp)\n"
2433 - "jns Lscaleloop\n"
2434 + "jns 0b\n"
2436 "andl $0x1,0x34(%esp)\n"
2437 - "je Lscaledone\n"
2438 + "je 2f\n"
2440 "mov %ebx,%eax\n"
2441 "sar $0x11,%eax\n"
2442 "movzbl (%edi,%eax,1),%eax\n"
2443 "movq 2048(%ecx,%eax,8),%mm0\n"
2444 "mov %ebx,%eax\n"
2445 "sar $0x11,%eax\n"
2446 "movzbl (%esi,%eax,1),%eax\n"
2447 @@ -651,66 +717,75 @@ extern void PICScaleYUVToRGB32Row(const
2448 "sar $0x10,%eax\n"
2449 "movzbl (%edx,%eax,1),%eax\n"
2450 "movq 0(%ecx,%eax,8),%mm1\n"
2451 "paddsw %mm0,%mm1\n"
2452 "psraw $0x6,%mm1\n"
2453 "packuswb %mm1,%mm1\n"
2454 "movd %mm1,0x0(%ebp)\n"
2456 -"Lscaledone:"
2457 +"2:"
2458 "popa\n"
2459 "ret\n"
2460 +#if !defined(XP_MACOSX)
2461 + ".previous\n"
2462 +#endif
2466 void ScaleYUVToRGB32Row(const uint8* y_buf,
2467 const uint8* u_buf,
2468 const uint8* v_buf,
2469 uint8* rgb_buf,
2470 int width,
2471 - int source_dx) {
2472 - PICScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2473 - &kCoefficientsRgbY[0][0]);
2476 -void PICLinearScaleYUVToRGB32Row(const uint8* y_buf,
2477 - const uint8* u_buf,
2478 - const uint8* v_buf,
2479 - uint8* rgb_buf,
2480 - int width,
2481 - int source_dx,
2482 - int16 *kCoefficientsRgbY);
2483 + int source_dx)
2485 + if (mozilla::supports_sse()) {
2486 + PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2487 + &kCoefficientsRgbY[0][0]);
2488 + return;
2491 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2494 +void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2495 + const uint8* u_buf,
2496 + const uint8* v_buf,
2497 + uint8* rgb_buf,
2498 + int width,
2499 + int source_dx,
2500 + int16 *kCoefficientsRgbY);
2502 asm(
2503 ".text\n"
2504 -#if defined(OS_MACOSX)
2505 -"_PICLinearScaleYUVToRGB32Row:\n"
2506 +#if defined(XP_MACOSX)
2507 +"_PICLinearScaleYUVToRGB32Row_SSE:\n"
2508 #else
2509 -"PICLinearScaleYUVToRGB32Row:\n"
2510 +"PICLinearScaleYUVToRGB32Row_SSE:\n"
2511 #endif
2512 "pusha\n"
2513 "mov 0x24(%esp),%edx\n"
2514 "mov 0x30(%esp),%ebp\n"
2515 "mov 0x34(%esp),%ecx\n"
2516 "mov 0x3c(%esp),%edi\n"
2517 "xor %ebx,%ebx\n"
2519 // source_width = width * source_dx + ebx
2520 "mov 0x34(%esp), %ecx\n"
2521 "imull 0x38(%esp), %ecx\n"
2522 "mov %ecx, 0x34(%esp)\n"
2524 "mov 0x38(%esp), %ecx\n"
2525 "xor %ebx,%ebx\n" // x = 0
2526 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
2527 - "jl .lscaleend\n"
2528 + "jl 1f\n"
2529 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
2530 - "jmp .lscaleend\n"
2532 -".lscaleloop:"
2533 + "jmp 1f\n"
2535 +"0:"
2536 "mov 0x28(%esp),%esi\n"
2537 "mov %ebx,%eax\n"
2538 "sar $0x11,%eax\n"
2540 "movzbl (%esi,%eax,1),%ecx\n"
2541 "movzbl 1(%esi,%eax,1),%esi\n"
2542 "mov %ebx,%eax\n"
2543 "andl $0x1fffe, %eax \n"
2544 @@ -746,17 +821,17 @@ void PICLinearScaleYUVToRGB32Row(const u
2545 "imul %eax, %esi \n"
2546 "xorl $0xffff, %eax \n"
2547 "imul %eax, %ecx \n"
2548 "addl %esi, %ecx \n"
2549 "shrl $16, %ecx \n"
2550 "movq (%edi,%ecx,8),%mm1\n"
2552 "cmp 0x34(%esp), %ebx\n"
2553 - "jge .lscalelastpixel\n"
2554 + "jge 2f\n"
2556 "mov %ebx,%eax\n"
2557 "sar $0x10,%eax\n"
2558 "movzbl (%edx,%eax,1),%ecx\n"
2559 "movzbl 1(%edx,%eax,1),%esi\n"
2560 "mov %ebx,%eax\n"
2561 "add 0x38(%esp),%ebx\n"
2562 "andl $0xffff, %eax \n"
2563 @@ -770,154 +845,71 @@ void PICLinearScaleYUVToRGB32Row(const u
2564 "paddsw %mm0,%mm1\n"
2565 "paddsw %mm0,%mm2\n"
2566 "psraw $0x6,%mm1\n"
2567 "psraw $0x6,%mm2\n"
2568 "packuswb %mm2,%mm1\n"
2569 "movntq %mm1,0x0(%ebp)\n"
2570 "add $0x8,%ebp\n"
2572 -".lscaleend:"
2573 +"1:"
2574 "cmp %ebx, 0x34(%esp)\n"
2575 - "jg .lscaleloop\n"
2576 + "jg 0b\n"
2577 "popa\n"
2578 "ret\n"
2580 -".lscalelastpixel:"
2581 +"2:"
2582 "paddsw %mm0, %mm1\n"
2583 "psraw $6, %mm1\n"
2584 "packuswb %mm1, %mm1\n"
2585 "movd %mm1, (%ebp)\n"
2586 "popa\n"
2587 "ret\n"
2588 +#if !defined(XP_MACOSX)
2589 + ".previous\n"
2590 +#endif
2594 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2595 - const uint8* u_buf,
2596 - const uint8* v_buf,
2597 - uint8* rgb_buf,
2598 - int width,
2599 - int source_dx) {
2600 - PICLinearScaleYUVToRGB32Row(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
2601 - &kCoefficientsRgbY[0][0]);
2604 -#else // USE_MMX
2606 -// C reference code that mimic the YUV assembly.
2607 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2608 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2609 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2611 -static inline void YuvPixel(uint8 y,
2612 - uint8 u,
2613 - uint8 v,
2614 - uint8* rgb_buf) {
2616 - int b = kCoefficientsRgbY[256+u][0];
2617 - int g = kCoefficientsRgbY[256+u][1];
2618 - int r = kCoefficientsRgbY[256+u][2];
2619 - int a = kCoefficientsRgbY[256+u][3];
2621 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2622 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2623 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2624 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2626 - b = paddsw(b, kCoefficientsRgbY[y][0]);
2627 - g = paddsw(g, kCoefficientsRgbY[y][1]);
2628 - r = paddsw(r, kCoefficientsRgbY[y][2]);
2629 - a = paddsw(a, kCoefficientsRgbY[y][3]);
2631 - b >>= 6;
2632 - g >>= 6;
2633 - r >>= 6;
2634 - a >>= 6;
2636 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
2637 - (packuswb(g) << 8) |
2638 - (packuswb(r) << 16) |
2639 - (packuswb(a) << 24);
2642 + const uint8* u_buf,
2643 + const uint8* v_buf,
2644 + uint8* rgb_buf,
2645 + int width,
2646 + int source_dx)
2648 + if (mozilla::supports_sse()) {
2649 + PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
2650 + source_dx, &kCoefficientsRgbY[0][0]);
2651 + return;
2654 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2656 +#else
2657 void FastConvertYUVToRGB32Row(const uint8* y_buf,
2658 const uint8* u_buf,
2659 const uint8* v_buf,
2660 uint8* rgb_buf,
2661 int width) {
2662 - for (int x = 0; x < width; x += 2) {
2663 - uint8 u = u_buf[x >> 1];
2664 - uint8 v = v_buf[x >> 1];
2665 - uint8 y0 = y_buf[x];
2666 - YuvPixel(y0, u, v, rgb_buf);
2667 - if ((x + 1) < width) {
2668 - uint8 y1 = y_buf[x + 1];
2669 - YuvPixel(y1, u, v, rgb_buf + 4);
2671 - rgb_buf += 8; // Advance 2 pixels.
2675 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
2676 -// A shift by 17 is used to further subsample the chrominence channels.
2677 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
2678 -// for 1/65536 pixel accurate interpolation.
2679 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
2682 void ScaleYUVToRGB32Row(const uint8* y_buf,
2683 const uint8* u_buf,
2684 const uint8* v_buf,
2685 uint8* rgb_buf,
2686 int width,
2687 int source_dx) {
2688 - int x = 0;
2689 - for (int i = 0; i < width; i += 2) {
2690 - int y = y_buf[x >> 16];
2691 - int u = u_buf[(x >> 17)];
2692 - int v = v_buf[(x >> 17)];
2693 - YuvPixel(y, u, v, rgb_buf);
2694 - x += source_dx;
2695 - if ((i + 1) < width) {
2696 - y = y_buf[x >> 16];
2697 - YuvPixel(y, u, v, rgb_buf+4);
2698 - x += source_dx;
2700 - rgb_buf += 8;
2703 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2706 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2707 const uint8* u_buf,
2708 const uint8* v_buf,
2709 uint8* rgb_buf,
2710 int width,
2711 int source_dx) {
2712 - int x = 0;
2713 - if (source_dx >= 0x20000) {
2714 - x = 32768;
2716 - for (int i = 0; i < width; i += 2) {
2717 - int y0 = y_buf[x >> 16];
2718 - int y1 = y_buf[(x >> 16) + 1];
2719 - int u0 = u_buf[(x >> 17)];
2720 - int u1 = u_buf[(x >> 17) + 1];
2721 - int v0 = v_buf[(x >> 17)];
2722 - int v1 = v_buf[(x >> 17) + 1];
2723 - int y_frac = (x & 65535);
2724 - int uv_frac = ((x >> 1) & 65535);
2725 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2726 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
2727 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
2728 - YuvPixel(y, u, v, rgb_buf);
2729 - x += source_dx;
2730 - if ((i + 1) < width) {
2731 - y0 = y_buf[x >> 16];
2732 - y1 = y_buf[(x >> 16) + 1];
2733 - y_frac = (x & 65535);
2734 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
2735 - YuvPixel(y, u, v, rgb_buf+4);
2736 - x += source_dx;
2738 - rgb_buf += 8;
2742 -#endif // USE_MMX
2743 -} // extern "C"
2745 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
2747 +#endif
2750 diff --git a/gfx/ycbcr/yuv_row_table.cpp b/gfx/ycbcr/yuv_row_table.cpp
2751 --- a/gfx/ycbcr/yuv_row_table.cpp
2752 +++ b/gfx/ycbcr/yuv_row_table.cpp
2753 @@ -1,13 +1,13 @@
2754 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2755 // Use of this source code is governed by a BSD-style license that can be
2756 // found in the LICENSE file.
2758 -#include "media/base/yuv_row.h"
2759 +#include "yuv_row.h"
2761 extern "C" {
2763 #define RGBY(i) { \
2764 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2765 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2766 static_cast<int16>(1.164 * 64 * (i - 16) + 0.5), \
2768 diff --git a/gfx/ycbcr/yuv_row_win.cpp b/gfx/ycbcr/yuv_row_win.cpp
2769 --- a/gfx/ycbcr/yuv_row_win.cpp
2770 +++ b/gfx/ycbcr/yuv_row_win.cpp
2771 @@ -1,26 +1,27 @@
2772 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2773 // Use of this source code is governed by a BSD-style license that can be
2774 // found in the LICENSE file.
2776 -#include "media/base/yuv_row.h"
2777 +#include "yuv_row.h"
2778 +#include "mozilla/SSE.h"
2780 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
2781 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
2783 extern "C" {
2785 -#if USE_MMX
2786 -__declspec(naked)
2787 -void FastConvertYUVToRGB32Row(const uint8* y_buf,
2788 - const uint8* u_buf,
2789 - const uint8* v_buf,
2790 - uint8* rgb_buf,
2791 - int width) {
2792 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
2793 +__declspec(naked)
2794 +void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2795 + const uint8* u_buf,
2796 + const uint8* v_buf,
2797 + uint8* rgb_buf,
2798 + int width) {
2799 __asm {
2800 pushad
2801 mov edx, [esp + 32 + 4] // Y
2802 mov edi, [esp + 32 + 8] // U
2803 mov esi, [esp + 32 + 12] // V
2804 mov ebp, [esp + 32 + 16] // rgb
2805 mov ecx, [esp + 32 + 20] // width
2806 jmp convertend
2807 @@ -64,22 +65,22 @@ void FastConvertYUVToRGB32Row(const uint
2808 convertdone :
2810 popad
2815 __declspec(naked)
2816 -void ConvertYUVToRGB32Row(const uint8* y_buf,
2817 - const uint8* u_buf,
2818 - const uint8* v_buf,
2819 - uint8* rgb_buf,
2820 - int width,
2821 - int step) {
2822 +void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2823 + const uint8* u_buf,
2824 + const uint8* v_buf,
2825 + uint8* rgb_buf,
2826 + int width,
2827 + int step) {
2828 __asm {
2829 pushad
2830 mov edx, [esp + 32 + 4] // Y
2831 mov edi, [esp + 32 + 8] // U
2832 mov esi, [esp + 32 + 12] // V
2833 mov ebp, [esp + 32 + 16] // rgb
2834 mov ecx, [esp + 32 + 20] // width
2835 mov ebx, [esp + 32 + 24] // step
2836 @@ -125,23 +126,23 @@ void ConvertYUVToRGB32Row(const uint8* y
2837 wdone :
2839 popad
2844 __declspec(naked)
2845 -void RotateConvertYUVToRGB32Row(const uint8* y_buf,
2846 - const uint8* u_buf,
2847 - const uint8* v_buf,
2848 - uint8* rgb_buf,
2849 - int width,
2850 - int ystep,
2851 - int uvstep) {
2852 +void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
2853 + const uint8* u_buf,
2854 + const uint8* v_buf,
2855 + uint8* rgb_buf,
2856 + int width,
2857 + int ystep,
2858 + int uvstep) {
2859 __asm {
2860 pushad
2861 mov edx, [esp + 32 + 4] // Y
2862 mov edi, [esp + 32 + 8] // U
2863 mov esi, [esp + 32 + 12] // V
2864 mov ebp, [esp + 32 + 16] // rgb
2865 mov ecx, [esp + 32 + 20] // width
2866 jmp wend
2867 @@ -188,21 +189,21 @@ void RotateConvertYUVToRGB32Row(const ui
2868 wdone :
2870 popad
2875 __declspec(naked)
2876 -void DoubleYUVToRGB32Row(const uint8* y_buf,
2877 - const uint8* u_buf,
2878 - const uint8* v_buf,
2879 - uint8* rgb_buf,
2880 - int width) {
2881 +void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
2882 + const uint8* u_buf,
2883 + const uint8* v_buf,
2884 + uint8* rgb_buf,
2885 + int width) {
2886 __asm {
2887 pushad
2888 mov edx, [esp + 32 + 4] // Y
2889 mov edi, [esp + 32 + 8] // U
2890 mov esi, [esp + 32 + 12] // V
2891 mov ebp, [esp + 32 + 16] // rgb
2892 mov ecx, [esp + 32 + 20] // width
2893 jmp wend
2894 @@ -256,26 +257,26 @@ void DoubleYUVToRGB32Row(const uint8* y_
2895 jns wloop1
2896 wdone :
2897 popad
2902 // This version does general purpose scaling by any amount, up or down.
2903 -// The only thing it can not do it rotation by 90 or 270.
2904 -// For performance the chroma is under sampled, reducing cost of a 3x
2905 +// The only thing it cannot do is rotation by 90 or 270.
2906 +// For performance the chroma is under-sampled, reducing cost of a 3x
2907 // 1080p scale from 8.4 ms to 5.4 ms.
2908 __declspec(naked)
2909 -void ScaleYUVToRGB32Row(const uint8* y_buf,
2910 - const uint8* u_buf,
2911 - const uint8* v_buf,
2912 - uint8* rgb_buf,
2913 - int width,
2914 - int source_dx) {
2915 +void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2916 + const uint8* u_buf,
2917 + const uint8* v_buf,
2918 + uint8* rgb_buf,
2919 + int width,
2920 + int source_dx) {
2921 __asm {
2922 pushad
2923 mov edx, [esp + 32 + 4] // Y
2924 mov edi, [esp + 32 + 8] // U
2925 mov esi, [esp + 32 + 12] // V
2926 mov ebp, [esp + 32 + 16] // rgb
2927 mov ecx, [esp + 32 + 20] // width
2928 xor ebx, ebx // x
2929 @@ -333,22 +334,22 @@ void ScaleYUVToRGB32Row(const uint8* y_b
2931 scaledone :
2932 popad
2937 __declspec(naked)
2938 -void LinearScaleYUVToRGB32Row(const uint8* y_buf,
2939 - const uint8* u_buf,
2940 - const uint8* v_buf,
2941 - uint8* rgb_buf,
2942 - int width,
2943 - int source_dx) {
2944 +void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
2945 + const uint8* u_buf,
2946 + const uint8* v_buf,
2947 + uint8* rgb_buf,
2948 + int width,
2949 + int source_dx) {
2950 __asm {
2951 pushad
2952 mov edx, [esp + 32 + 4] // Y
2953 mov edi, [esp + 32 + 8] // U
2954 // [esp + 32 + 12] // V
2955 mov ebp, [esp + 32 + 16] // rgb
2956 mov ecx, [esp + 32 + 20] // width
2957 imul ecx, [esp + 32 + 24] // source_dx
2958 @@ -438,152 +439,60 @@ lscalelastpixel:
2959 paddsw mm1, mm0
2960 psraw mm1, 6
2961 packuswb mm1, mm1
2962 movd [ebp], mm1
2963 popad
2967 -#else // USE_MMX
2969 -// C reference code that mimic the YUV assembly.
2970 -#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
2971 -#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
2972 - (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
2974 -static inline void YuvPixel(uint8 y,
2975 - uint8 u,
2976 - uint8 v,
2977 - uint8* rgb_buf) {
2979 - int b = kCoefficientsRgbY[256+u][0];
2980 - int g = kCoefficientsRgbY[256+u][1];
2981 - int r = kCoefficientsRgbY[256+u][2];
2982 - int a = kCoefficientsRgbY[256+u][3];
2984 - b = paddsw(b, kCoefficientsRgbY[512+v][0]);
2985 - g = paddsw(g, kCoefficientsRgbY[512+v][1]);
2986 - r = paddsw(r, kCoefficientsRgbY[512+v][2]);
2987 - a = paddsw(a, kCoefficientsRgbY[512+v][3]);
2989 - b = paddsw(b, kCoefficientsRgbY[y][0]);
2990 - g = paddsw(g, kCoefficientsRgbY[y][1]);
2991 - r = paddsw(r, kCoefficientsRgbY[y][2]);
2992 - a = paddsw(a, kCoefficientsRgbY[y][3]);
2994 - b >>= 6;
2995 - g >>= 6;
2996 - r >>= 6;
2997 - a >>= 6;
2999 - *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b)) |
3000 - (packuswb(g) << 8) |
3001 - (packuswb(r) << 16) |
3002 - (packuswb(a) << 24);
3005 -#if TEST_MMX_YUV
3006 -static inline void YuvPixel(uint8 y,
3007 - uint8 u,
3008 - uint8 v,
3009 - uint8* rgb_buf) {
3011 - __asm {
3012 - movzx eax, u
3013 - movq mm0, [kCoefficientsRgbY+2048 + 8 * eax]
3014 - movzx eax, v
3015 - paddsw mm0, [kCoefficientsRgbY+4096 + 8 * eax]
3016 - movzx eax, y
3017 - movq mm1, [kCoefficientsRgbY + 8 * eax]
3018 - paddsw mm1, mm0
3019 - psraw mm1, 6
3020 - packuswb mm1, mm1
3021 - mov eax, rgb_buf
3022 - movd [eax], mm1
3023 - emms
3026 -#endif
3027 +#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3029 void FastConvertYUVToRGB32Row(const uint8* y_buf,
3030 const uint8* u_buf,
3031 const uint8* v_buf,
3032 uint8* rgb_buf,
3033 int width) {
3034 - for (int x = 0; x < width; x += 2) {
3035 - uint8 u = u_buf[x >> 1];
3036 - uint8 v = v_buf[x >> 1];
3037 - uint8 y0 = y_buf[x];
3038 - YuvPixel(y0, u, v, rgb_buf);
3039 - if ((x + 1) < width) {
3040 - uint8 y1 = y_buf[x + 1];
3041 - YuvPixel(y1, u, v, rgb_buf + 4);
3043 - rgb_buf += 8; // Advance 2 pixels.
3047 -// 16.16 fixed point is used. A shift by 16 isolates the integer.
3048 -// A shift by 17 is used to further subsample the chrominence channels.
3049 -// & 0xffff isolates the fixed point fraction. >> 2 to get the upper 2 bits,
3050 -// for 1/65536 pixel accurate interpolation.
3051 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3052 + if (mozilla::supports_sse()) {
3053 + FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
3054 + return;
3056 +#endif
3058 + FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
3061 void ScaleYUVToRGB32Row(const uint8* y_buf,
3062 const uint8* u_buf,
3063 const uint8* v_buf,
3064 uint8* rgb_buf,
3065 int width,
3066 int source_dx) {
3067 - int x = 0;
3068 - for (int i = 0; i < width; i += 2) {
3069 - int y = y_buf[x >> 16];
3070 - int u = u_buf[(x >> 17)];
3071 - int v = v_buf[(x >> 17)];
3072 - YuvPixel(y, u, v, rgb_buf);
3073 - x += source_dx;
3074 - if ((i + 1) < width) {
3075 - y = y_buf[x >> 16];
3076 - YuvPixel(y, u, v, rgb_buf+4);
3077 - x += source_dx;
3079 - rgb_buf += 8;
3083 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3084 + if (mozilla::supports_sse()) {
3085 + ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3086 + return;
3088 +#endif
3090 + ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3093 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
3094 const uint8* u_buf,
3095 const uint8* v_buf,
3096 uint8* rgb_buf,
3097 int width,
3098 int source_dx) {
3099 - int x = 0;
3100 - if (source_dx >= 0x20000) {
3101 - x = 32768;
3103 - for (int i = 0; i < width; i += 2) {
3104 - int y0 = y_buf[x >> 16];
3105 - int y1 = y_buf[(x >> 16) + 1];
3106 - int u0 = u_buf[(x >> 17)];
3107 - int u1 = u_buf[(x >> 17) + 1];
3108 - int v0 = v_buf[(x >> 17)];
3109 - int v1 = v_buf[(x >> 17) + 1];
3110 - int y_frac = (x & 65535);
3111 - int uv_frac = ((x >> 1) & 65535);
3112 - int y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3113 - int u = (uv_frac * u1 + (uv_frac ^ 65535) * u0) >> 16;
3114 - int v = (uv_frac * v1 + (uv_frac ^ 65535) * v0) >> 16;
3115 - YuvPixel(y, u, v, rgb_buf);
3116 - x += source_dx;
3117 - if ((i + 1) < width) {
3118 - y0 = y_buf[x >> 16];
3119 - y1 = y_buf[(x >> 16) + 1];
3120 - y_frac = (x & 65535);
3121 - y = (y_frac * y1 + (y_frac ^ 65535) * y0) >> 16;
3122 - YuvPixel(y, u, v, rgb_buf+4);
3123 - x += source_dx;
3125 - rgb_buf += 8;
3129 -#endif // USE_MMX
3130 -} // extern "C"
3132 +#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
3133 + if (mozilla::supports_sse()) {
3134 + LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
3135 + source_dx);
3136 + return;
3138 +#endif
3140 + LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
3143 +} // extern "C"