1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
6 #include "mozilla/SSE.h"
// Addresses of the U and V coefficient tables, expressed as byte offsets from
// kCoefficientsRgbY. The asm in this file indexes each table as
// [table + 8 * value], where value is a zero-extended byte and the operand is
// an 8-byte movq/paddsw load, so 2048 (= 256 * 8) is presumably the size of
// one table -- confirm against the definition of kCoefficientsRgbY.
// NOTE(review): the expansions are unparenthesized; every use visible in this
// file is inside a bracketed asm address operand, where that is harmless.
8 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
9 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
14 #if defined(__clang__)
15 // clang-cl has a bug where it doesn't mangle names referenced in inline asm,
16 // so do the mangling in the preprocessor instead (ugh). A dummy extern
17 // declaration is still needed so that the parser accepts the mangled name.
18 extern void* _kCoefficientsRgbY
;
19 #define kCoefficientsRgbY _kCoefficientsRgbY
23 void FastConvertYUVToRGB32Row_SSE(const uint8
* y_buf
,
30 mov edx
, [esp
+ 32 + 4] // Y
31 mov edi
, [esp
+ 32 + 8] // U
32 mov esi
, [esp
+ 32 + 12] // V
33 mov ebp
, [esp
+ 32 + 16] // rgb
34 mov ecx
, [esp
+ 32 + 20] // width
38 movzx eax
, byte ptr
[edi
]
40 movzx ebx
, byte ptr
[esi
]
42 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
43 movzx eax
, byte ptr
[edx
]
44 paddsw mm0
, [kCoefficientsRgbV
+ 8 * ebx
]
45 movzx ebx
, byte ptr
[edx
+ 1]
46 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
48 movq mm2
, [kCoefficientsRgbY
+ 8 * ebx
]
60 and ecx
, 1 // odd number of pixels?
63 movzx eax
, byte ptr
[edi
]
64 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
65 movzx eax
, byte ptr
[esi
]
66 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
67 movzx eax
, byte ptr
[edx
]
68 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
81 void ConvertYUVToRGB32Row_SSE(const uint8
* y_buf
,
89 mov edx
, [esp
+ 32 + 4] // Y
90 mov edi
, [esp
+ 32 + 8] // U
91 mov esi
, [esp
+ 32 + 12] // V
92 mov ebp
, [esp
+ 32 + 16] // rgb
93 mov ecx
, [esp
+ 32 + 20] // width
94 mov ebx
, [esp
+ 32 + 24] // step
98 movzx eax
, byte ptr
[edi
]
100 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
101 movzx eax
, byte ptr
[esi
]
103 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
104 movzx eax
, byte ptr
[edx
]
106 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
107 movzx eax
, byte ptr
[edx
]
109 movq mm2
, [kCoefficientsRgbY
+ 8 * eax
]
121 and ecx
, 1 // odd number of pixels?
124 movzx eax
, byte ptr
[edi
]
125 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
126 movzx eax
, byte ptr
[esi
]
127 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
128 movzx eax
, byte ptr
[edx
]
129 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
142 void RotateConvertYUVToRGB32Row_SSE(const uint8
* y_buf
,
151 mov edx
, [esp
+ 32 + 4] // Y
152 mov edi
, [esp
+ 32 + 8] // U
153 mov esi
, [esp
+ 32 + 12] // V
154 mov ebp
, [esp
+ 32 + 16] // rgb
155 mov ecx
, [esp
+ 32 + 20] // width
159 movzx eax
, byte ptr
[edi
]
160 mov ebx
, [esp
+ 32 + 28] // uvstep
162 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
163 movzx eax
, byte ptr
[esi
]
165 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
166 movzx eax
, byte ptr
[edx
]
167 mov ebx
, [esp
+ 32 + 24] // ystep
169 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
170 movzx eax
, byte ptr
[edx
]
172 movq mm2
, [kCoefficientsRgbY
+ 8 * eax
]
184 and ecx
, 1 // odd number of pixels?
187 movzx eax
, byte ptr
[edi
]
188 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
189 movzx eax
, byte ptr
[esi
]
190 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
191 movzx eax
, byte ptr
[edx
]
192 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
205 void DoubleYUVToRGB32Row_SSE(const uint8
* y_buf
,
212 mov edx
, [esp
+ 32 + 4] // Y
213 mov edi
, [esp
+ 32 + 8] // U
214 mov esi
, [esp
+ 32 + 12] // V
215 mov ebp
, [esp
+ 32 + 16] // rgb
216 mov ecx
, [esp
+ 32 + 20] // width
220 movzx eax
, byte ptr
[edi
]
222 movzx ebx
, byte ptr
[esi
]
224 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
225 movzx eax
, byte ptr
[edx
]
226 paddsw mm0
, [kCoefficientsRgbV
+ 8 * ebx
]
227 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
234 movzx ebx
, byte ptr
[edx
+ 1]
236 paddsw mm0
, [kCoefficientsRgbY
+ 8 * ebx
]
249 movzx eax
, byte ptr
[edi
]
250 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
251 movzx eax
, byte ptr
[esi
]
252 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
253 movzx eax
, byte ptr
[edx
]
254 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
272 // This version does general purpose scaling by any amount, up or down.
273 // The only thing it cannot do is rotation by 90 or 270.
274 // For performance the chroma is under-sampled, reducing cost of a 3x
275 // 1080p scale from 8.4 ms to 5.4 ms.
277 void ScaleYUVToRGB32Row_SSE(const uint8
* y_buf
,
285 mov edx
, [esp
+ 32 + 4] // Y
286 mov edi
, [esp
+ 32 + 8] // U
287 mov esi
, [esp
+ 32 + 12] // V
288 mov ebp
, [esp
+ 32 + 16] // rgb
289 mov ecx
, [esp
+ 32 + 20] // width
296 movzx eax
, byte ptr
[edi
+ eax
]
297 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
300 movzx eax
, byte ptr
[esi
+ eax
]
301 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
303 add ebx
, [esp
+ 32 + 24] // x += source_dx
305 movzx eax
, byte ptr
[edx
+ eax
]
306 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
308 add ebx
, [esp
+ 32 + 24] // x += source_dx
310 movzx eax
, byte ptr
[edx
+ eax
]
311 movq mm2
, [kCoefficientsRgbY
+ 8 * eax
]
323 and ecx
, 1 // odd number of pixels?
328 movzx eax
, byte ptr
[edi
+ eax
]
329 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
332 movzx eax
, byte ptr
[esi
+ eax
]
333 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
336 movzx eax
, byte ptr
[edx
+ eax
]
337 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
350 void LinearScaleYUVToRGB32Row_SSE(const uint8
* y_buf
,
358 mov edx
, [esp
+ 32 + 4] // Y
359 mov edi
, [esp
+ 32 + 8] // U
360 // [esp + 32 + 12] // V
361 mov ebp
, [esp
+ 32 + 16] // rgb
362 mov ecx
, [esp
+ 32 + 20] // width
363 imul ecx
, [esp
+ 32 + 24] // source_dx
364 mov
[esp
+ 32 + 20], ecx
// source_width = width * source_dx
365 mov ecx
, [esp
+ 32 + 24] // source_dx
366 xor ebx
, ebx
// x = 0
369 mov ebx
, 0x8000 // x = 0.5 for 1/2 or less
375 movzx ecx
, byte ptr
[edi
+ eax
]
376 movzx esi
, byte ptr
[edi
+ eax
+ 1]
384 movq mm0
, [kCoefficientsRgbU
+ 8 * ecx
]
386 mov esi
, [esp
+ 32 + 12]
390 movzx ecx
, byte ptr
[esi
+ eax
]
391 movzx esi
, byte ptr
[esi
+ eax
+ 1]
399 paddsw mm0
, [kCoefficientsRgbV
+ 8 * ecx
]
403 movzx ecx
, byte ptr
[edx
+ eax
]
404 movzx esi
, byte ptr
[1 + edx
+ eax
]
406 add ebx
, [esp
+ 32 + 24]
413 movq mm1
, [kCoefficientsRgbY
+ 8 * ecx
]
415 cmp ebx
, [esp
+ 32 + 20]
420 movzx ecx
, byte ptr
[edx
+ eax
]
421 movzx esi
, byte ptr
[edx
+ eax
+ 1]
423 add ebx
, [esp
+ 32 + 24]
430 movq mm2
, [kCoefficientsRgbY
+ 8 * ecx
]
441 cmp ebx
, [esp
+ 32 + 20]
455 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
457 void FastConvertYUVToRGB32Row(const uint8
* y_buf
,
462 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
463 if (mozilla::supports_sse()) {
464 FastConvertYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
);
469 FastConvertYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, 1);
472 void ScaleYUVToRGB32Row(const uint8
* y_buf
,
479 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
480 if (mozilla::supports_sse()) {
481 ScaleYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);
486 ScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);
489 void LinearScaleYUVToRGB32Row(const uint8
* y_buf
,
495 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
496 if (mozilla::supports_sse()) {
497 LinearScaleYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
,
503 LinearScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);