Bumping manifests a=b2g-bump
[gecko.git] / gfx / ycbcr / yuv_row_win.cpp
blob5cd931139e9d79a9e5b0353f03c94968d0c7e0fa
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "yuv_row.h"
6 #include "mozilla/SSE.h"
// Addresses of the U and V lookup tables, expressed as offsets from
// kCoefficientsRgbY. The inline assembly in this file indexes each table as
// [table + 8 * byte_value], i.e. 256 entries of 8 bytes = 2048 bytes per
// table, which matches the spacing used here.
// The expansions are not parenthesized, so these macros are only safe in
// purely additive contexts such as those address expressions.
8 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
9 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
11 extern "C" {
13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
// Converts one row of planar YUV to 32-bit RGB at 1:1 scale, two pixels per
// loop iteration, using MMX registers and the 8-byte-per-entry lookup tables.
//
//   y_buf   - luma bytes; one is consumed per output pixel.
//   u_buf   - U chroma bytes; one is consumed per pair of output pixels.
//   v_buf   - V chroma bytes; one is consumed per pair of output pixels.
//   rgb_buf - destination; 4 bytes are written per pixel.
//   width   - number of pixels to convert.
//
// The function is naked: it has no compiler-generated prologue/epilogue, so
// the body saves/restores registers itself and reads its arguments directly
// off the stack.
// No emms is executed before returning.
// NOTE(review): presumably callers reset the MMX state themselves - confirm.
14 __declspec(naked)
15 void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
16 const uint8* u_buf,
17 const uint8* v_buf,
18 uint8* rgb_buf,
19 int width) {
20 __asm {
// pushad pushes eight 32-bit registers (32 bytes); with the return address
// above them, the first argument therefore lives at [esp + 32 + 4].
21 pushad
22 mov edx, [esp + 32 + 4] // Y
23 mov edi, [esp + 32 + 8] // U
24 mov esi, [esp + 32 + 12] // V
25 mov ebp, [esp + 32 + 16] // rgb
26 mov ecx, [esp + 32 + 20] // width
// Enter at the loop test so that a width below 2 skips the paired loop.
27 jmp convertend
29 convertloop :
// Fetch one U and one V byte; their table entries are summed (saturating
// 16-bit adds) into mm0, the chroma term shared by both pixels of the pair.
30 movzx eax, byte ptr [edi]
31 add edi, 1
32 movzx ebx, byte ptr [esi]
33 add esi, 1
34 movq mm0, [kCoefficientsRgbU + 8 * eax]
35 movzx eax, byte ptr [edx]
36 paddsw mm0, [kCoefficientsRgbV + 8 * ebx]
// Two consecutive Y bytes give the luma terms for pixel 0 (mm1) and
// pixel 1 (mm2).
37 movzx ebx, byte ptr [edx + 1]
38 movq mm1, [kCoefficientsRgbY + 8 * eax]
39 add edx, 2
40 movq mm2, [kCoefficientsRgbY + 8 * ebx]
41 paddsw mm1, mm0
42 paddsw mm2, mm0
// Arithmetic shift right by 6 on each 16-bit lane, then pack both pixels to
// unsigned bytes with saturation: 4 words + 4 words -> 8 bytes.
43 psraw mm1, 6
44 psraw mm2, 6
45 packuswb mm1, mm2
// movntq writes the 8 bytes with a non-temporal hint.
46 movntq [ebp], mm1
47 add ebp, 8
48 convertend :
// Keep looping while at least two pixels remained before the subtraction.
49 sub ecx, 2
50 jns convertloop
// ecx is now -2 (even width) or -1 (odd width); bit 0 tells them apart.
52 and ecx, 1 // odd number of pixels?
53 jz convertdone
// Trailing single pixel: same computation, but only 4 bytes are stored.
55 movzx eax, byte ptr [edi]
56 movq mm0, [kCoefficientsRgbU + 8 * eax]
57 movzx eax, byte ptr [esi]
58 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
59 movzx eax, byte ptr [edx]
60 movq mm1, [kCoefficientsRgbY + 8 * eax]
61 paddsw mm1, mm0
62 psraw mm1, 6
63 packuswb mm1, mm1
64 movd [ebp], mm1
65 convertdone :
67 popad
68 ret
// NOTE(review): the closing braces of the __asm block and of the function
// are not visible in this listing - presumably lost in extraction; confirm
// against the upstream file.
// Same conversion as the 1:1 row converter, but every source pointer is
// advanced by a caller-supplied stride instead of by one byte.
//
//   y_buf   - luma bytes; the pointer advances by `step` after each pixel.
//   u_buf   - U chroma bytes; the pointer advances by `step` per pixel pair.
//   v_buf   - V chroma bytes; the pointer advances by `step` per pixel pair.
//   rgb_buf - destination; 4 bytes are written per pixel, contiguously.
//   width   - number of pixels to convert.
//   step    - byte increment applied to the source pointers.
//
// Naked function: arguments are read from the stack after pushad, and no
// emms is executed before returning.
72 __declspec(naked)
73 void ConvertYUVToRGB32Row_SSE(const uint8* y_buf,
74 const uint8* u_buf,
75 const uint8* v_buf,
76 uint8* rgb_buf,
77 int width,
78 int step) {
79 __asm {
// pushad stores 32 bytes of registers, hence the "+ 32" in every argument
// offset below.
80 pushad
81 mov edx, [esp + 32 + 4] // Y
82 mov edi, [esp + 32 + 8] // U
83 mov esi, [esp + 32 + 12] // V
84 mov ebp, [esp + 32 + 16] // rgb
85 mov ecx, [esp + 32 + 20] // width
86 mov ebx, [esp + 32 + 24] // step
// Start at the loop test so widths below 2 bypass the paired loop.
87 jmp wend
89 wloop :
// Chroma term for the pair: U entry + V entry, saturating 16-bit adds.
90 movzx eax, byte ptr [edi]
91 add edi, ebx
92 movq mm0, [kCoefficientsRgbU + 8 * eax]
93 movzx eax, byte ptr [esi]
94 add esi, ebx
95 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
// Two luma samples, `step` bytes apart, for the two pixels of the pair.
96 movzx eax, byte ptr [edx]
97 add edx, ebx
98 movq mm1, [kCoefficientsRgbY + 8 * eax]
99 movzx eax, byte ptr [edx]
100 add edx, ebx
101 movq mm2, [kCoefficientsRgbY + 8 * eax]
102 paddsw mm1, mm0
103 paddsw mm2, mm0
// Shift each 16-bit lane right by 6, then pack both pixels into 8 bytes
// with unsigned saturation and store them with a non-temporal hint.
104 psraw mm1, 6
105 psraw mm2, 6
106 packuswb mm1, mm2
107 movntq [ebp], mm1
108 add ebp, 8
109 wend :
// Continue while at least two pixels remained before the subtraction.
110 sub ecx, 2
111 jns wloop
// ecx ends at -2 or -1; bit 0 is set only when one pixel is left over.
113 and ecx, 1 // odd number of pixels?
114 jz wdone
// Trailing single pixel: 4 bytes stored with movd.
116 movzx eax, byte ptr [edi]
117 movq mm0, [kCoefficientsRgbU + 8 * eax]
118 movzx eax, byte ptr [esi]
119 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
120 movzx eax, byte ptr [edx]
121 movq mm1, [kCoefficientsRgbY + 8 * eax]
122 paddsw mm1, mm0
123 psraw mm1, 6
124 packuswb mm1, mm1
125 movd [ebp], mm1
126 wdone :
128 popad
// NOTE(review): no `ret` and no closing braces are visible after this popad.
// A naked function gets no compiler-generated return, so a `ret` is required
// here - presumably it was lost in extraction; confirm against upstream.
// Row converter with independent strides for the luma and chroma planes.
//
//   y_buf   - luma bytes; the pointer advances by `ystep` after each pixel.
//   u_buf   - U chroma bytes; the pointer advances by `uvstep` per pixel pair.
//   v_buf   - V chroma bytes; the pointer advances by `uvstep` per pixel pair.
//   rgb_buf - destination; 4 bytes are written per pixel, contiguously.
//   width   - number of pixels to convert.
//   ystep   - byte increment for y_buf.
//   uvstep  - byte increment for u_buf and v_buf.
//
// Naked function: arguments are read from the stack after pushad, and no
// emms is executed before returning.
133 __declspec(naked)
134 void RotateConvertYUVToRGB32Row_SSE(const uint8* y_buf,
135 const uint8* u_buf,
136 const uint8* v_buf,
137 uint8* rgb_buf,
138 int width,
139 int ystep,
140 int uvstep) {
141 __asm {
// pushad stores 32 bytes of registers, hence the "+ 32" in every argument
// offset below.
142 pushad
143 mov edx, [esp + 32 + 4] // Y
144 mov edi, [esp + 32 + 8] // U
145 mov esi, [esp + 32 + 12] // V
146 mov ebp, [esp + 32 + 16] // rgb
147 mov ecx, [esp + 32 + 20] // width
// Start at the loop test so widths below 2 bypass the paired loop.
148 jmp wend
150 wloop :
// ebx is shared by both strides, so each one is reloaded from the stack
// on every iteration: first uvstep for the chroma pointers...
151 movzx eax, byte ptr [edi]
152 mov ebx, [esp + 32 + 28] // uvstep
153 add edi, ebx
154 movq mm0, [kCoefficientsRgbU + 8 * eax]
155 movzx eax, byte ptr [esi]
156 add esi, ebx
157 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
// ...then ystep for the two luma samples of the pair.
158 movzx eax, byte ptr [edx]
159 mov ebx, [esp + 32 + 24] // ystep
160 add edx, ebx
161 movq mm1, [kCoefficientsRgbY + 8 * eax]
162 movzx eax, byte ptr [edx]
163 add edx, ebx
164 movq mm2, [kCoefficientsRgbY + 8 * eax]
165 paddsw mm1, mm0
166 paddsw mm2, mm0
// Shift each 16-bit lane right by 6, then pack both pixels into 8 bytes
// with unsigned saturation and store them with a non-temporal hint.
167 psraw mm1, 6
168 psraw mm2, 6
169 packuswb mm1, mm2
170 movntq [ebp], mm1
171 add ebp, 8
172 wend :
// Continue while at least two pixels remained before the subtraction.
173 sub ecx, 2
174 jns wloop
// ecx ends at -2 or -1; bit 0 is set only when one pixel is left over.
176 and ecx, 1 // odd number of pixels?
177 jz wdone
// Trailing single pixel: 4 bytes stored with movd.
179 movzx eax, byte ptr [edi]
180 movq mm0, [kCoefficientsRgbU + 8 * eax]
181 movzx eax, byte ptr [esi]
182 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
183 movzx eax, byte ptr [edx]
184 movq mm1, [kCoefficientsRgbY + 8 * eax]
185 paddsw mm1, mm0
186 psraw mm1, 6
187 packuswb mm1, mm1
188 movd [ebp], mm1
189 wdone :
191 popad
// NOTE(review): no `ret` and no closing braces are visible after this popad.
// A naked function gets no compiler-generated return, so a `ret` is required
// here - presumably it was lost in extraction; confirm against upstream.
// Converts a row while doubling it horizontally: every converted source
// pixel is written twice.
//
//   y_buf   - luma bytes; two are consumed per main-loop iteration.
//   u_buf   - U chroma bytes; one is consumed per main-loop iteration.
//   v_buf   - V chroma bytes; one is consumed per main-loop iteration.
//   rgb_buf - destination; 4 bytes per output pixel.
//   width   - number of OUTPUT pixels (the main loop emits 4 per iteration).
//
// Naked function: arguments are read from the stack after pushad, and no
// emms is executed before returning.
196 __declspec(naked)
197 void DoubleYUVToRGB32Row_SSE(const uint8* y_buf,
198 const uint8* u_buf,
199 const uint8* v_buf,
200 uint8* rgb_buf,
201 int width) {
202 __asm {
// pushad stores 32 bytes of registers, hence the "+ 32" in every argument
// offset below.
203 pushad
204 mov edx, [esp + 32 + 4] // Y
205 mov edi, [esp + 32 + 8] // U
206 mov esi, [esp + 32 + 12] // V
207 mov ebp, [esp + 32 + 16] // rgb
208 mov ecx, [esp + 32 + 20] // width
// Start at the loop test so widths below 4 bypass the main loop.
209 jmp wend
211 wloop :
// Chroma term shared by both source pixels: U entry + V entry.
212 movzx eax, byte ptr [edi]
213 add edi, 1
214 movzx ebx, byte ptr [esi]
215 add esi, 1
216 movq mm0, [kCoefficientsRgbU + 8 * eax]
217 movzx eax, byte ptr [edx]
218 paddsw mm0, [kCoefficientsRgbV + 8 * ebx]
// First source pixel: convert, pack to 4 bytes, then punpckldq copies the
// low dword into the high dword so the 8-byte store holds the pixel twice.
219 movq mm1, [kCoefficientsRgbY + 8 * eax]
220 paddsw mm1, mm0
221 psraw mm1, 6
222 packuswb mm1, mm1
223 punpckldq mm1, mm1
224 movntq [ebp], mm1
// Second source pixel: its luma entry is added straight into mm0, which is
// safe because mm0 is reloaded at the top of the next iteration.
226 movzx ebx, byte ptr [edx + 1]
227 add edx, 2
228 paddsw mm0, [kCoefficientsRgbY + 8 * ebx]
229 psraw mm0, 6
230 packuswb mm0, mm0
231 punpckldq mm0, mm0
232 movntq [ebp+8], mm0
233 add ebp, 16
234 wend :
// Continue while at least four output pixels remained before the subtraction.
235 sub ecx, 4
236 jns wloop
// Undo the last subtraction: ecx becomes the number of leftover output
// pixels (0..3).
238 add ecx, 4
239 jz wdone
// Leftovers: convert one pixel from the current U, V and Y positions...
241 movzx eax, byte ptr [edi]
242 movq mm0, [kCoefficientsRgbU + 8 * eax]
243 movzx eax, byte ptr [esi]
244 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
245 movzx eax, byte ptr [edx]
246 movq mm1, [kCoefficientsRgbY + 8 * eax]
247 paddsw mm1, mm0
248 psraw mm1, 6
249 packuswb mm1, mm1
250 jmp wend1
// ...and write that same pixel once for every leftover output position.
252 wloop1 :
253 movd [ebp], mm1
254 add ebp, 4
255 wend1 :
256 sub ecx, 1
257 jns wloop1
258 wdone :
259 popad
// NOTE(review): no `ret` and no closing braces are visible after this popad.
// A naked function gets no compiler-generated return, so a `ret` is required
// here - presumably it was lost in extraction; confirm against upstream.
264 // This version does general purpose scaling by any amount, up or down.
265 // The only thing it cannot do is rotation by 90 or 270.
266 // For performance the chroma is under-sampled, reducing cost of a 3x
267 // 1080p scale from 8.4 ms to 5.4 ms.
//
// Nearest-sample scaler: the source position x is kept in ebx as a
// fixed-point value with 16 fractional bits (the luma index is x >> 16) and
// advances by source_dx per output pixel. Chroma is indexed with x >> 17,
// i.e. at half the luma index, and is fetched once per pair of output pixels.
//
//   y_buf     - luma bytes, indexed by x >> 16.
//   u_buf     - U chroma bytes, indexed by x >> 17.
//   v_buf     - V chroma bytes, indexed by x >> 17.
//   rgb_buf   - destination; 4 bytes are written per pixel.
//   width     - number of output pixels.
//   source_dx - fixed-point source increment per output pixel.
//
// Naked function: arguments are read from the stack after pushad, and no
// emms is executed before returning.
268 __declspec(naked)
269 void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
270 const uint8* u_buf,
271 const uint8* v_buf,
272 uint8* rgb_buf,
273 int width,
274 int source_dx) {
275 __asm {
// pushad stores 32 bytes of registers, hence the "+ 32" in every argument
// offset below.
276 pushad
277 mov edx, [esp + 32 + 4] // Y
278 mov edi, [esp + 32 + 8] // U
279 mov esi, [esp + 32 + 12] // V
280 mov ebp, [esp + 32 + 16] // rgb
281 mov ecx, [esp + 32 + 20] // width
282 xor ebx, ebx // x
// Start at the loop test so widths below 2 bypass the paired loop.
283 jmp scaleend
285 scaleloop :
// Chroma for the pair, sampled at the position of the first pixel.
286 mov eax, ebx
287 sar eax, 17
288 movzx eax, byte ptr [edi + eax]
289 movq mm0, [kCoefficientsRgbU + 8 * eax]
290 mov eax, ebx
291 sar eax, 17
292 movzx eax, byte ptr [esi + eax]
293 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
// First luma sample; x is copied to eax before it is advanced.
294 mov eax, ebx
295 add ebx, [esp + 32 + 24] // x += source_dx
296 sar eax, 16
297 movzx eax, byte ptr [edx + eax]
298 movq mm1, [kCoefficientsRgbY + 8 * eax]
// Second luma sample, one source_dx further along.
299 mov eax, ebx
300 add ebx, [esp + 32 + 24] // x += source_dx
301 sar eax, 16
302 movzx eax, byte ptr [edx + eax]
303 movq mm2, [kCoefficientsRgbY + 8 * eax]
304 paddsw mm1, mm0
305 paddsw mm2, mm0
// Shift each 16-bit lane right by 6, then pack both pixels into 8 bytes
// with unsigned saturation and store them with a non-temporal hint.
306 psraw mm1, 6
307 psraw mm2, 6
308 packuswb mm1, mm2
309 movntq [ebp], mm1
310 add ebp, 8
311 scaleend :
// Continue while at least two pixels remained before the subtraction.
312 sub ecx, 2
313 jns scaleloop
// ecx ends at -2 or -1; bit 0 is set only when one pixel is left over.
315 and ecx, 1 // odd number of pixels?
316 jz scaledone
// Trailing single pixel at the current x: 4 bytes stored with movd.
318 mov eax, ebx
319 sar eax, 17
320 movzx eax, byte ptr [edi + eax]
321 movq mm0, [kCoefficientsRgbU + 8 * eax]
322 mov eax, ebx
323 sar eax, 17
324 movzx eax, byte ptr [esi + eax]
325 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
326 mov eax, ebx
327 sar eax, 16
328 movzx eax, byte ptr [edx + eax]
329 movq mm1, [kCoefficientsRgbY + 8 * eax]
330 paddsw mm1, mm0
331 psraw mm1, 6
332 packuswb mm1, mm1
333 movd [ebp], mm1
335 scaledone :
336 popad
// NOTE(review): no `ret` and no closing braces are visible after this popad.
// A naked function gets no compiler-generated return, so a `ret` is required
// here - presumably it was lost in extraction; confirm against upstream.
// Scaling row converter that linearly interpolates between neighbouring
// source samples instead of picking the nearest one.
//
// The source position x lives in ebx as a fixed-point value with 16
// fractional bits. For each plane the sample at the integer index and the
// one after it are blended with weights taken from the fractional bits of x.
// Luma uses index x >> 16 and a 16-bit fraction; chroma uses index x >> 17
// and a fraction masked with 0x1fffe.
//
//   y_buf     - luma bytes; bytes [x >> 16] and [(x >> 16) + 1] are read.
//   u_buf     - U chroma bytes; bytes [x >> 17] and [(x >> 17) + 1] are read.
//   v_buf     - V chroma bytes; same indexing as u_buf.
//   rgb_buf   - destination; 4 bytes are written per pixel.
//   width     - number of output pixels; its stack slot is overwritten with
//               width * source_dx and then used as the end bound for x.
//   source_dx - fixed-point source increment per output pixel.
//
// NOTE(review): because index + 1 is always read, each plane presumably needs
// one readable byte past the last sample used - confirm with the callers.
// Naked function: arguments are read from the stack after pushad, and no
// emms is executed before returning.
341 __declspec(naked)
342 void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
343 const uint8* u_buf,
344 const uint8* v_buf,
345 uint8* rgb_buf,
346 int width,
347 int source_dx) {
348 __asm {
// pushad stores 32 bytes of registers, hence the "+ 32" in every argument
// offset below.
349 pushad
350 mov edx, [esp + 32 + 4] // Y
351 mov edi, [esp + 32 + 8] // U
// The V pointer stays on the stack: esi is needed as scratch inside the
// loop and is reloaded from this slot when the V plane is sampled.
352 // [esp + 32 + 12] // V
353 mov ebp, [esp + 32 + 16] // rgb
354 mov ecx, [esp + 32 + 20] // width
355 imul ecx, [esp + 32 + 24] // source_dx
356 mov [esp + 32 + 20], ecx // source_width = width * source_dx
357 mov ecx, [esp + 32 + 24] // source_dx
358 xor ebx, ebx // x = 0
// Start at x = 0 unless source_dx >= 0x20000 (2.0 in 16.16 fixed point),
// in which case start at 0x8000 (0.5).
359 cmp ecx, 0x20000
360 jl lscaleend
361 mov ebx, 0x8000 // x = 0.5 for 1/2 or less
362 jmp lscaleend
363 lscaleloop:
// --- U: blend U[i] and U[i + 1], i = x >> 17 ---
364 mov eax, ebx
365 sar eax, 0x11
367 movzx ecx, byte ptr [edi + eax]
368 movzx esi, byte ptr [edi + eax + 1]
// eax = fraction f; U[i + 1] is weighted by f and U[i] by (f ^ 0x1fffe),
// which equals 0x1fffe - f because f only has bits inside the mask.
369 mov eax, ebx
370 and eax, 0x1fffe
371 imul esi, eax
372 xor eax, 0x1fffe
373 imul ecx, eax
374 add ecx, esi
375 shr ecx, 17
376 movq mm0, [kCoefficientsRgbU + 8 * ecx]
// --- V: same blend, with the plane pointer reloaded from the stack ---
378 mov esi, [esp + 32 + 12]
379 mov eax, ebx
380 sar eax, 0x11
382 movzx ecx, byte ptr [esi + eax]
383 movzx esi, byte ptr [esi + eax + 1]
384 mov eax, ebx
385 and eax, 0x1fffe
386 imul esi, eax
387 xor eax, 0x1fffe
388 imul ecx, eax
389 add ecx, esi
390 shr ecx, 17
// mm0 = U entry + V entry: the chroma term shared by both pixels.
391 paddsw mm0, [kCoefficientsRgbV + 8 * ecx]
// --- Y, first pixel: blend Y[j] and Y[j + 1], j = x >> 16 ---
393 mov eax, ebx
394 sar eax, 0x10
395 movzx ecx, byte ptr [edx + eax]
396 movzx esi, byte ptr [1 + edx + eax]
// x is advanced here, after its old value has been copied into eax.
397 mov eax, ebx
398 add ebx, [esp + 32 + 24]
399 and eax, 0xffff
400 imul esi, eax
401 xor eax, 0xffff
402 imul ecx, eax
403 add ecx, esi
404 shr ecx, 16
405 movq mm1, [kCoefficientsRgbY + 8 * ecx]
// If x has reached source_width there is no second pixel in this pair:
// finish with the single-pixel store.
407 cmp ebx, [esp + 32 + 20]
408 jge lscalelastpixel
// --- Y, second pixel ---
410 mov eax, ebx
411 sar eax, 0x10
412 movzx ecx, byte ptr [edx + eax]
413 movzx esi, byte ptr [edx + eax + 1]
414 mov eax, ebx
415 add ebx, [esp + 32 + 24]
416 and eax, 0xffff
417 imul esi, eax
418 xor eax, 0xffff
419 imul ecx, eax
420 add ecx, esi
421 shr ecx, 16
422 movq mm2, [kCoefficientsRgbY + 8 * ecx]
// Add chroma, shift each 16-bit lane right by 6, pack both pixels into
// 8 bytes with unsigned saturation and store with a non-temporal hint.
424 paddsw mm1, mm0
425 paddsw mm2, mm0
426 psraw mm1, 0x6
427 psraw mm2, 0x6
428 packuswb mm1, mm2
429 movntq [ebp], mm1
430 add ebp, 0x8
432 lscaleend:
// Loop while x < source_width (signed comparison).
433 cmp ebx, [esp + 32 + 20]
434 jl lscaleloop
435 popad
// NOTE(review): no `ret` is visible between this popad and the label below.
// As listed, execution would fall through into lscalelastpixel and run a
// second popad. A `ret` was presumably lost in extraction - confirm against
// upstream.
438 lscalelastpixel:
// Odd trailing pixel: only mm1 is valid, so 4 bytes are stored with movd.
439 paddsw mm1, mm0
440 psraw mm1, 6
441 packuswb mm1, mm1
442 movd [ebp], mm1
443 popad
// NOTE(review): no `ret` and no closing braces are visible after this popad
// either - presumably lost in extraction; confirm against upstream.
447 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
// Public entry point for 1:1 row conversion.
// When the build defines MOZILLA_MAY_SUPPORT_SSE and _M_IX86, and
// mozilla::supports_sse() reports true at run time, the work is handed to
// FastConvertYUVToRGB32Row_SSE and the function returns. In every other case
// it calls FastConvertYUVToRGB32Row_C.
449 void FastConvertYUVToRGB32Row(const uint8* y_buf,
450 const uint8* u_buf,
451 const uint8* v_buf,
452 uint8* rgb_buf,
453 int width) {
454 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
455 if (mozilla::supports_sse()) {
456 FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
457 return;
// NOTE(review): the brace closing the `if` above is not visible in this
// listing - presumably lost in extraction; confirm against upstream.
459 #endif
// The C fallback takes one extra trailing argument, passed as 1 here.
// NOTE(review): its meaning comes from the _C declaration, which is not
// visible in this file - confirm.
461 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
// NOTE(review): the function's closing brace is likewise not visible here.
// Public entry point for nearest-sample scaled row conversion.
// When the build defines MOZILLA_MAY_SUPPORT_SSE and _M_IX86, and
// mozilla::supports_sse() reports true at run time, the work is handed to
// ScaleYUVToRGB32Row_SSE and the function returns. In every other case it
// calls ScaleYUVToRGB32Row_C with the same arguments.
464 void ScaleYUVToRGB32Row(const uint8* y_buf,
465 const uint8* u_buf,
466 const uint8* v_buf,
467 uint8* rgb_buf,
468 int width,
469 int source_dx) {
471 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
472 if (mozilla::supports_sse()) {
473 ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
474 return;
// NOTE(review): the brace closing the `if` above is not visible in this
// listing - presumably lost in extraction; confirm against upstream.
476 #endif
478 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
// NOTE(review): the function's closing brace is likewise not visible here.
// Public entry point for linearly interpolated scaled row conversion.
// When the build defines MOZILLA_MAY_SUPPORT_SSE and _M_IX86, and
// mozilla::supports_sse() reports true at run time, the work is handed to
// LinearScaleYUVToRGB32Row_SSE and the function returns. In every other case
// it calls LinearScaleYUVToRGB32Row_C with the same arguments.
481 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
482 const uint8* u_buf,
483 const uint8* v_buf,
484 uint8* rgb_buf,
485 int width,
486 int source_dx) {
487 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
488 if (mozilla::supports_sse()) {
489 LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
490 source_dx);
491 return;
// NOTE(review): the brace closing the `if` above is not visible in this
// listing - presumably lost in extraction; confirm against upstream.
493 #endif
495 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
// NOTE(review): the function's closing brace is likewise not visible here.
498 } // extern "C"