1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
6 #include "mozilla/SSE.h"
// Addresses of the U and V lookup tables, written as byte offsets from
// kCoefficientsRgbY. The lookups visible in this file index each table with
// 8 * <zero-extended byte>, so a 256-entry table spans 256 * 8 = 2048 bytes;
// presumably the tables are laid out contiguously as Y, U, V -- confirm against
// the definition of kCoefficientsRgbY.
// NOTE(review): the expansions are unparenthesized. That is harmless in the
// "[kCoefficientsRgbX + 8 * reg]" address operands used below, but would be
// unsafe in a general C++ expression; confirm before using these macros
// outside the inline assembly.
8 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
9 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
15 void FastConvertYUVToRGB32Row_SSE(const uint8
* y_buf
,
22 mov edx
, [esp
+ 32 + 4] // Y
23 mov edi
, [esp
+ 32 + 8] // U
24 mov esi
, [esp
+ 32 + 12] // V
25 mov ebp
, [esp
+ 32 + 16] // rgb
26 mov ecx
, [esp
+ 32 + 20] // width
30 movzx eax
, byte ptr
[edi
]
32 movzx ebx
, byte ptr
[esi
]
34 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
35 movzx eax
, byte ptr
[edx
]
36 paddsw mm0
, [kCoefficientsRgbV
+ 8 * ebx
]
37 movzx ebx
, byte ptr
[edx
+ 1]
38 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
40 movq mm2
, [kCoefficientsRgbY
+ 8 * ebx
]
52 and ecx
, 1 // odd number of pixels?
55 movzx eax
, byte ptr
[edi
]
56 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
57 movzx eax
, byte ptr
[esi
]
58 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
59 movzx eax
, byte ptr
[edx
]
60 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
73 void ConvertYUVToRGB32Row_SSE(const uint8
* y_buf
,
81 mov edx
, [esp
+ 32 + 4] // Y
82 mov edi
, [esp
+ 32 + 8] // U
83 mov esi
, [esp
+ 32 + 12] // V
84 mov ebp
, [esp
+ 32 + 16] // rgb
85 mov ecx
, [esp
+ 32 + 20] // width
86 mov ebx
, [esp
+ 32 + 24] // step
90 movzx eax
, byte ptr
[edi
]
92 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
93 movzx eax
, byte ptr
[esi
]
95 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
96 movzx eax
, byte ptr
[edx
]
98 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
99 movzx eax
, byte ptr
[edx
]
101 movq mm2
, [kCoefficientsRgbY
+ 8 * eax
]
113 and ecx
, 1 // odd number of pixels?
116 movzx eax
, byte ptr
[edi
]
117 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
118 movzx eax
, byte ptr
[esi
]
119 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
120 movzx eax
, byte ptr
[edx
]
121 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
134 void RotateConvertYUVToRGB32Row_SSE(const uint8
* y_buf
,
143 mov edx
, [esp
+ 32 + 4] // Y
144 mov edi
, [esp
+ 32 + 8] // U
145 mov esi
, [esp
+ 32 + 12] // V
146 mov ebp
, [esp
+ 32 + 16] // rgb
147 mov ecx
, [esp
+ 32 + 20] // width
151 movzx eax
, byte ptr
[edi
]
152 mov ebx
, [esp
+ 32 + 28] // uvstep
154 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
155 movzx eax
, byte ptr
[esi
]
157 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
158 movzx eax
, byte ptr
[edx
]
159 mov ebx
, [esp
+ 32 + 24] // ystep
161 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
162 movzx eax
, byte ptr
[edx
]
164 movq mm2
, [kCoefficientsRgbY
+ 8 * eax
]
176 and ecx
, 1 // odd number of pixels?
179 movzx eax
, byte ptr
[edi
]
180 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
181 movzx eax
, byte ptr
[esi
]
182 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
183 movzx eax
, byte ptr
[edx
]
184 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
197 void DoubleYUVToRGB32Row_SSE(const uint8
* y_buf
,
204 mov edx
, [esp
+ 32 + 4] // Y
205 mov edi
, [esp
+ 32 + 8] // U
206 mov esi
, [esp
+ 32 + 12] // V
207 mov ebp
, [esp
+ 32 + 16] // rgb
208 mov ecx
, [esp
+ 32 + 20] // width
212 movzx eax
, byte ptr
[edi
]
214 movzx ebx
, byte ptr
[esi
]
216 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
217 movzx eax
, byte ptr
[edx
]
218 paddsw mm0
, [kCoefficientsRgbV
+ 8 * ebx
]
219 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
226 movzx ebx
, byte ptr
[edx
+ 1]
228 paddsw mm0
, [kCoefficientsRgbY
+ 8 * ebx
]
241 movzx eax
, byte ptr
[edi
]
242 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
243 movzx eax
, byte ptr
[esi
]
244 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
245 movzx eax
, byte ptr
[edx
]
246 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
264 // This version does general purpose scaling by any amount, up or down.
265 // The only thing it cannot do is rotation by 90 or 270 degrees.
266 // For performance the chroma is under-sampled, reducing cost of a 3x
267 // 1080p scale from 8.4 ms to 5.4 ms.
269 void ScaleYUVToRGB32Row_SSE(const uint8
* y_buf
,
277 mov edx
, [esp
+ 32 + 4] // Y
278 mov edi
, [esp
+ 32 + 8] // U
279 mov esi
, [esp
+ 32 + 12] // V
280 mov ebp
, [esp
+ 32 + 16] // rgb
281 mov ecx
, [esp
+ 32 + 20] // width
288 movzx eax
, byte ptr
[edi
+ eax
]
289 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
292 movzx eax
, byte ptr
[esi
+ eax
]
293 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
295 add ebx
, [esp
+ 32 + 24] // x += source_dx
297 movzx eax
, byte ptr
[edx
+ eax
]
298 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
300 add ebx
, [esp
+ 32 + 24] // x += source_dx
302 movzx eax
, byte ptr
[edx
+ eax
]
303 movq mm2
, [kCoefficientsRgbY
+ 8 * eax
]
315 and ecx
, 1 // odd number of pixels?
320 movzx eax
, byte ptr
[edi
+ eax
]
321 movq mm0
, [kCoefficientsRgbU
+ 8 * eax
]
324 movzx eax
, byte ptr
[esi
+ eax
]
325 paddsw mm0
, [kCoefficientsRgbV
+ 8 * eax
]
328 movzx eax
, byte ptr
[edx
+ eax
]
329 movq mm1
, [kCoefficientsRgbY
+ 8 * eax
]
342 void LinearScaleYUVToRGB32Row_SSE(const uint8
* y_buf
,
350 mov edx
, [esp
+ 32 + 4] // Y
351 mov edi
, [esp
+ 32 + 8] // U
352 // [esp + 32 + 12] // V
353 mov ebp
, [esp
+ 32 + 16] // rgb
354 mov ecx
, [esp
+ 32 + 20] // width
355 imul ecx
, [esp
+ 32 + 24] // source_dx
356 mov
[esp
+ 32 + 20], ecx
// source_width = width * source_dx
357 mov ecx
, [esp
+ 32 + 24] // source_dx
358 xor ebx
, ebx
// x = 0
361 mov ebx
, 0x8000 // x = 0.5 for 1/2 or less
367 movzx ecx
, byte ptr
[edi
+ eax
]
368 movzx esi
, byte ptr
[edi
+ eax
+ 1]
376 movq mm0
, [kCoefficientsRgbU
+ 8 * ecx
]
378 mov esi
, [esp
+ 32 + 12]
382 movzx ecx
, byte ptr
[esi
+ eax
]
383 movzx esi
, byte ptr
[esi
+ eax
+ 1]
391 paddsw mm0
, [kCoefficientsRgbV
+ 8 * ecx
]
395 movzx ecx
, byte ptr
[edx
+ eax
]
396 movzx esi
, byte ptr
[1 + edx
+ eax
]
398 add ebx
, [esp
+ 32 + 24]
405 movq mm1
, [kCoefficientsRgbY
+ 8 * ecx
]
407 cmp ebx
, [esp
+ 32 + 20]
412 movzx ecx
, byte ptr
[edx
+ eax
]
413 movzx esi
, byte ptr
[edx
+ eax
+ 1]
415 add ebx
, [esp
+ 32 + 24]
422 movq mm2
, [kCoefficientsRgbY
+ 8 * ecx
]
433 cmp ebx
, [esp
+ 32 + 20]
447 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
449 void FastConvertYUVToRGB32Row(const uint8
* y_buf
,
454 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
455 if (mozilla::supports_sse()) {
456 FastConvertYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
);
461 FastConvertYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, 1);
464 void ScaleYUVToRGB32Row(const uint8
* y_buf
,
471 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
472 if (mozilla::supports_sse()) {
473 ScaleYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);
478 ScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);
481 void LinearScaleYUVToRGB32Row(const uint8
* y_buf
,
487 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
488 if (mozilla::supports_sse()) {
489 LinearScaleYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
,
495 LinearScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);