Bug 1856663 - Add more chunks for Android mochitest-plain. r=jmaher,taskgraph-reviewe...
[gecko.git] / gfx / ycbcr / yuv_row_win.cpp
blobc496b2d935ca3817dd6c41dabb130293129e89a0
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "yuv_row.h"
6 #include "mozilla/SSE.h"
// Offsets of the U and V lookup tables relative to kCoefficientsRgbY.
// The assembly in this file indexes each table as [table + 8 * value] where
// value is a zero-extended byte, i.e. 256 entries of 8 bytes = 2048 bytes per
// table; U therefore starts 2048 bytes and V 4096 bytes after Y.
// Within this file the macros are only expanded inside [...] address operands
// of the __asm blocks, where "+ 2048" is a byte displacement.
// NOTE(review): the expansions are unparenthesized; keep that in mind before
// using them in an ordinary C++ expression.
8 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
9 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
11 extern "C" {
13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
14 #if defined(__clang__)
15 // clang-cl has a bug where it doesn't mangle names in inline asm
16 // so let's do the mangling in the preprocessor (ugh)
17 // (but we still need to declare a dummy extern for the parser)
// The extern below only gives the parser a declaration for the
// underscore-prefixed name; the macro after it redirects every later use of
// kCoefficientsRgbY in this file (including the kCoefficientsRgbU/V
// expansions) to that name.
18 extern void* _kCoefficientsRgbY;
19 #define kCoefficientsRgbY _kCoefficientsRgbY
20 #endif
22 __declspec(naked)
23 void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
24 const uint8_t* u_buf,
25 const uint8_t* v_buf,
26 uint8_t* rgb_buf,
27 int width) {
28 __asm {
29 pushad
30 mov edx, [esp + 32 + 4] // Y
31 mov edi, [esp + 32 + 8] // U
32 mov esi, [esp + 32 + 12] // V
33 mov ebp, [esp + 32 + 16] // rgb
34 mov ecx, [esp + 32 + 20] // width
35 jmp convertend
37 convertloop :
38 movzx eax, byte ptr [edi]
39 add edi, 1
40 movzx ebx, byte ptr [esi]
41 add esi, 1
42 movq mm0, [kCoefficientsRgbU + 8 * eax]
43 movzx eax, byte ptr [edx]
44 paddsw mm0, [kCoefficientsRgbV + 8 * ebx]
45 movzx ebx, byte ptr [edx + 1]
46 movq mm1, [kCoefficientsRgbY + 8 * eax]
47 add edx, 2
48 movq mm2, [kCoefficientsRgbY + 8 * ebx]
49 paddsw mm1, mm0
50 paddsw mm2, mm0
51 psraw mm1, 6
52 psraw mm2, 6
53 packuswb mm1, mm2
54 movntq [ebp], mm1
55 add ebp, 8
56 convertend :
57 sub ecx, 2
58 jns convertloop
60 and ecx, 1 // odd number of pixels?
61 jz convertdone
63 movzx eax, byte ptr [edi]
64 movq mm0, [kCoefficientsRgbU + 8 * eax]
65 movzx eax, byte ptr [esi]
66 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
67 movzx eax, byte ptr [edx]
68 movq mm1, [kCoefficientsRgbY + 8 * eax]
69 paddsw mm1, mm0
70 psraw mm1, 6
71 packuswb mm1, mm1
72 movd [ebp], mm1
73 convertdone :
75 popad
76 ret
80 __declspec(naked)
81 void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
82 const uint8_t* u_buf,
83 const uint8_t* v_buf,
84 uint8_t* rgb_buf,
85 int width,
86 int step) {
87 __asm {
88 pushad
89 mov edx, [esp + 32 + 4] // Y
90 mov edi, [esp + 32 + 8] // U
91 mov esi, [esp + 32 + 12] // V
92 mov ebp, [esp + 32 + 16] // rgb
93 mov ecx, [esp + 32 + 20] // width
94 mov ebx, [esp + 32 + 24] // step
95 jmp wend
97 wloop :
98 movzx eax, byte ptr [edi]
99 add edi, ebx
100 movq mm0, [kCoefficientsRgbU + 8 * eax]
101 movzx eax, byte ptr [esi]
102 add esi, ebx
103 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
104 movzx eax, byte ptr [edx]
105 add edx, ebx
106 movq mm1, [kCoefficientsRgbY + 8 * eax]
107 movzx eax, byte ptr [edx]
108 add edx, ebx
109 movq mm2, [kCoefficientsRgbY + 8 * eax]
110 paddsw mm1, mm0
111 paddsw mm2, mm0
112 psraw mm1, 6
113 psraw mm2, 6
114 packuswb mm1, mm2
115 movntq [ebp], mm1
116 add ebp, 8
117 wend :
118 sub ecx, 2
119 jns wloop
121 and ecx, 1 // odd number of pixels?
122 jz wdone
124 movzx eax, byte ptr [edi]
125 movq mm0, [kCoefficientsRgbU + 8 * eax]
126 movzx eax, byte ptr [esi]
127 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
128 movzx eax, byte ptr [edx]
129 movq mm1, [kCoefficientsRgbY + 8 * eax]
130 paddsw mm1, mm0
131 psraw mm1, 6
132 packuswb mm1, mm1
133 movd [ebp], mm1
134 wdone :
136 popad
141 __declspec(naked)
142 void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
143 const uint8_t* u_buf,
144 const uint8_t* v_buf,
145 uint8_t* rgb_buf,
146 int width,
147 int ystep,
148 int uvstep) {
149 __asm {
150 pushad
151 mov edx, [esp + 32 + 4] // Y
152 mov edi, [esp + 32 + 8] // U
153 mov esi, [esp + 32 + 12] // V
154 mov ebp, [esp + 32 + 16] // rgb
155 mov ecx, [esp + 32 + 20] // width
156 jmp wend
158 wloop :
159 movzx eax, byte ptr [edi]
160 mov ebx, [esp + 32 + 28] // uvstep
161 add edi, ebx
162 movq mm0, [kCoefficientsRgbU + 8 * eax]
163 movzx eax, byte ptr [esi]
164 add esi, ebx
165 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
166 movzx eax, byte ptr [edx]
167 mov ebx, [esp + 32 + 24] // ystep
168 add edx, ebx
169 movq mm1, [kCoefficientsRgbY + 8 * eax]
170 movzx eax, byte ptr [edx]
171 add edx, ebx
172 movq mm2, [kCoefficientsRgbY + 8 * eax]
173 paddsw mm1, mm0
174 paddsw mm2, mm0
175 psraw mm1, 6
176 psraw mm2, 6
177 packuswb mm1, mm2
178 movntq [ebp], mm1
179 add ebp, 8
180 wend :
181 sub ecx, 2
182 jns wloop
184 and ecx, 1 // odd number of pixels?
185 jz wdone
187 movzx eax, byte ptr [edi]
188 movq mm0, [kCoefficientsRgbU + 8 * eax]
189 movzx eax, byte ptr [esi]
190 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
191 movzx eax, byte ptr [edx]
192 movq mm1, [kCoefficientsRgbY + 8 * eax]
193 paddsw mm1, mm0
194 psraw mm1, 6
195 packuswb mm1, mm1
196 movd [ebp], mm1
197 wdone :
199 popad
204 __declspec(naked)
205 void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf,
206 const uint8_t* u_buf,
207 const uint8_t* v_buf,
208 uint8_t* rgb_buf,
209 int width) {
210 __asm {
211 pushad
212 mov edx, [esp + 32 + 4] // Y
213 mov edi, [esp + 32 + 8] // U
214 mov esi, [esp + 32 + 12] // V
215 mov ebp, [esp + 32 + 16] // rgb
216 mov ecx, [esp + 32 + 20] // width
217 jmp wend
219 wloop :
220 movzx eax, byte ptr [edi]
221 add edi, 1
222 movzx ebx, byte ptr [esi]
223 add esi, 1
224 movq mm0, [kCoefficientsRgbU + 8 * eax]
225 movzx eax, byte ptr [edx]
226 paddsw mm0, [kCoefficientsRgbV + 8 * ebx]
227 movq mm1, [kCoefficientsRgbY + 8 * eax]
228 paddsw mm1, mm0
229 psraw mm1, 6
230 packuswb mm1, mm1
231 punpckldq mm1, mm1
232 movntq [ebp], mm1
234 movzx ebx, byte ptr [edx + 1]
235 add edx, 2
236 paddsw mm0, [kCoefficientsRgbY + 8 * ebx]
237 psraw mm0, 6
238 packuswb mm0, mm0
239 punpckldq mm0, mm0
240 movntq [ebp+8], mm0
241 add ebp, 16
242 wend :
243 sub ecx, 4
244 jns wloop
246 add ecx, 4
247 jz wdone
249 movzx eax, byte ptr [edi]
250 movq mm0, [kCoefficientsRgbU + 8 * eax]
251 movzx eax, byte ptr [esi]
252 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
253 movzx eax, byte ptr [edx]
254 movq mm1, [kCoefficientsRgbY + 8 * eax]
255 paddsw mm1, mm0
256 psraw mm1, 6
257 packuswb mm1, mm1
258 jmp wend1
260 wloop1 :
261 movd [ebp], mm1
262 add ebp, 4
263 wend1 :
264 sub ecx, 1
265 jns wloop1
266 wdone :
267 popad
272 // This version does general purpose scaling by any amount, up or down.
273 // The only thing it cannot do is rotation by 90 or 270.
274 // For performance the chroma is under-sampled, reducing cost of a 3x
275 // 1080p scale from 8.4 ms to 5.4 ms.
276 __declspec(naked)
277 void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
278 const uint8_t* u_buf,
279 const uint8_t* v_buf,
280 uint8_t* rgb_buf,
281 int width,
282 int source_dx) {
283 __asm {
284 pushad
285 mov edx, [esp + 32 + 4] // Y
286 mov edi, [esp + 32 + 8] // U
287 mov esi, [esp + 32 + 12] // V
288 mov ebp, [esp + 32 + 16] // rgb
289 mov ecx, [esp + 32 + 20] // width
290 xor ebx, ebx // x
291 jmp scaleend
293 scaleloop :
294 mov eax, ebx
295 sar eax, 17
296 movzx eax, byte ptr [edi + eax]
297 movq mm0, [kCoefficientsRgbU + 8 * eax]
298 mov eax, ebx
299 sar eax, 17
300 movzx eax, byte ptr [esi + eax]
301 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
302 mov eax, ebx
303 add ebx, [esp + 32 + 24] // x += source_dx
304 sar eax, 16
305 movzx eax, byte ptr [edx + eax]
306 movq mm1, [kCoefficientsRgbY + 8 * eax]
307 mov eax, ebx
308 add ebx, [esp + 32 + 24] // x += source_dx
309 sar eax, 16
310 movzx eax, byte ptr [edx + eax]
311 movq mm2, [kCoefficientsRgbY + 8 * eax]
312 paddsw mm1, mm0
313 paddsw mm2, mm0
314 psraw mm1, 6
315 psraw mm2, 6
316 packuswb mm1, mm2
317 movntq [ebp], mm1
318 add ebp, 8
319 scaleend :
320 sub ecx, 2
321 jns scaleloop
323 and ecx, 1 // odd number of pixels?
324 jz scaledone
326 mov eax, ebx
327 sar eax, 17
328 movzx eax, byte ptr [edi + eax]
329 movq mm0, [kCoefficientsRgbU + 8 * eax]
330 mov eax, ebx
331 sar eax, 17
332 movzx eax, byte ptr [esi + eax]
333 paddsw mm0, [kCoefficientsRgbV + 8 * eax]
334 mov eax, ebx
335 sar eax, 16
336 movzx eax, byte ptr [edx + eax]
337 movq mm1, [kCoefficientsRgbY + 8 * eax]
338 paddsw mm1, mm0
339 psraw mm1, 6
340 packuswb mm1, mm1
341 movd [ebp], mm1
343 scaledone :
344 popad
349 __declspec(naked)
350 void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
351 const uint8_t* u_buf,
352 const uint8_t* v_buf,
353 uint8_t* rgb_buf,
354 int width,
355 int source_dx) {
356 __asm {
357 pushad
358 mov edx, [esp + 32 + 4] // Y
359 mov edi, [esp + 32 + 8] // U
360 // [esp + 32 + 12] // V
361 mov ebp, [esp + 32 + 16] // rgb
362 mov ecx, [esp + 32 + 20] // width
363 imul ecx, [esp + 32 + 24] // source_dx
364 mov [esp + 32 + 20], ecx // source_width = width * source_dx
365 mov ecx, [esp + 32 + 24] // source_dx
366 xor ebx, ebx // x = 0
367 cmp ecx, 0x20000
368 jl lscaleend
369 mov ebx, 0x8000 // x = 0.5 for 1/2 or less
370 jmp lscaleend
371 lscaleloop:
372 mov eax, ebx
373 sar eax, 0x11
375 movzx ecx, byte ptr [edi + eax]
376 movzx esi, byte ptr [edi + eax + 1]
377 mov eax, ebx
378 and eax, 0x1fffe
379 imul esi, eax
380 xor eax, 0x1fffe
381 imul ecx, eax
382 add ecx, esi
383 shr ecx, 17
384 movq mm0, [kCoefficientsRgbU + 8 * ecx]
386 mov esi, [esp + 32 + 12]
387 mov eax, ebx
388 sar eax, 0x11
390 movzx ecx, byte ptr [esi + eax]
391 movzx esi, byte ptr [esi + eax + 1]
392 mov eax, ebx
393 and eax, 0x1fffe
394 imul esi, eax
395 xor eax, 0x1fffe
396 imul ecx, eax
397 add ecx, esi
398 shr ecx, 17
399 paddsw mm0, [kCoefficientsRgbV + 8 * ecx]
401 mov eax, ebx
402 sar eax, 0x10
403 movzx ecx, byte ptr [edx + eax]
404 movzx esi, byte ptr [1 + edx + eax]
405 mov eax, ebx
406 add ebx, [esp + 32 + 24]
407 and eax, 0xffff
408 imul esi, eax
409 xor eax, 0xffff
410 imul ecx, eax
411 add ecx, esi
412 shr ecx, 16
413 movq mm1, [kCoefficientsRgbY + 8 * ecx]
415 cmp ebx, [esp + 32 + 20]
416 jge lscalelastpixel
418 mov eax, ebx
419 sar eax, 0x10
420 movzx ecx, byte ptr [edx + eax]
421 movzx esi, byte ptr [edx + eax + 1]
422 mov eax, ebx
423 add ebx, [esp + 32 + 24]
424 and eax, 0xffff
425 imul esi, eax
426 xor eax, 0xffff
427 imul ecx, eax
428 add ecx, esi
429 shr ecx, 16
430 movq mm2, [kCoefficientsRgbY + 8 * ecx]
432 paddsw mm1, mm0
433 paddsw mm2, mm0
434 psraw mm1, 0x6
435 psraw mm2, 0x6
436 packuswb mm1, mm2
437 movntq [ebp], mm1
438 add ebp, 0x8
440 lscaleend:
441 cmp ebx, [esp + 32 + 20]
442 jl lscaleloop
443 popad
446 lscalelastpixel:
447 paddsw mm1, mm0
448 psraw mm1, 6
449 packuswb mm1, mm1
450 movd [ebp], mm1
451 popad
455 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
457 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
458 const uint8_t* u_buf,
459 const uint8_t* v_buf,
460 uint8_t* rgb_buf,
461 int width) {
462 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
463 if (mozilla::supports_sse()) {
464 FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
465 return;
467 #endif
469 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
472 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
473 const uint8_t* u_buf,
474 const uint8_t* v_buf,
475 uint8_t* rgb_buf,
476 int width,
477 int source_dx) {
479 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
480 if (mozilla::supports_sse()) {
481 ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
482 return;
484 #endif
486 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
489 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
490 const uint8_t* u_buf,
491 const uint8_t* v_buf,
492 uint8_t* rgb_buf,
493 int width,
494 int source_dx) {
495 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
496 if (mozilla::supports_sse()) {
497 LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
498 source_dx);
499 return;
501 #endif
503 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
506 } // extern "C"