Bumping manifests a=b2g-bump
[gecko.git] / gfx / ycbcr / yuv_row_posix.cpp
blobb359db4dd0eab9ca49b62b471b4407a37d3448c7
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "yuv_row.h"
6 #include "mozilla/SSE.h"
8 #define DCHECK(a)
10 extern "C" {
12 #if defined(ARCH_CPU_X86_64)
14 // We don't need CPUID guards here, since x86-64 implies SSE2.
16 // AMD64 ABI uses register parameters.
17 void FastConvertYUVToRGB32Row(const uint8* y_buf, // rdi
18 const uint8* u_buf, // rsi
19 const uint8* v_buf, // rdx
20 uint8* rgb_buf, // rcx
21 int width) { // r8
22 asm(
23 "jmp 1f\n"
24 "0:"
25 "movzb (%1),%%r10\n"
26 "add $0x1,%1\n"
27 "movzb (%2),%%r11\n"
28 "add $0x1,%2\n"
29 "movq 2048(%5,%%r10,8),%%xmm0\n"
30 "movzb (%0),%%r10\n"
31 "movq 4096(%5,%%r11,8),%%xmm1\n"
32 "movzb 0x1(%0),%%r11\n"
33 "paddsw %%xmm1,%%xmm0\n"
34 "movq (%5,%%r10,8),%%xmm2\n"
35 "add $0x2,%0\n"
36 "movq (%5,%%r11,8),%%xmm3\n"
37 "paddsw %%xmm0,%%xmm2\n"
38 "paddsw %%xmm0,%%xmm3\n"
39 "shufps $0x44,%%xmm3,%%xmm2\n"
40 "psraw $0x6,%%xmm2\n"
41 "packuswb %%xmm2,%%xmm2\n"
42 "movq %%xmm2,0x0(%3)\n"
43 "add $0x8,%3\n"
44 "1:"
45 "sub $0x2,%4\n"
46 "jns 0b\n"
48 "2:"
49 "add $0x1,%4\n"
50 "js 3f\n"
52 "movzb (%1),%%r10\n"
53 "movq 2048(%5,%%r10,8),%%xmm0\n"
54 "movzb (%2),%%r10\n"
55 "movq 4096(%5,%%r10,8),%%xmm1\n"
56 "paddsw %%xmm1,%%xmm0\n"
57 "movzb (%0),%%r10\n"
58 "movq (%5,%%r10,8),%%xmm1\n"
59 "paddsw %%xmm0,%%xmm1\n"
60 "psraw $0x6,%%xmm1\n"
61 "packuswb %%xmm1,%%xmm1\n"
62 "movd %%xmm1,0x0(%3)\n"
63 "3:"
65 : "r"(y_buf), // %0
66 "r"(u_buf), // %1
67 "r"(v_buf), // %2
68 "r"(rgb_buf), // %3
69 "r"(width), // %4
70 "r" (kCoefficientsRgbY) // %5
71 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
75 void ScaleYUVToRGB32Row(const uint8* y_buf, // rdi
76 const uint8* u_buf, // rsi
77 const uint8* v_buf, // rdx
78 uint8* rgb_buf, // rcx
79 int width, // r8
80 int source_dx) { // r9
81 asm(
82 "xor %%r11,%%r11\n"
83 "sub $0x2,%4\n"
84 "js 1f\n"
86 "0:"
87 "mov %%r11,%%r10\n"
88 "sar $0x11,%%r10\n"
89 "movzb (%1,%%r10,1),%%rax\n"
90 "movq 2048(%5,%%rax,8),%%xmm0\n"
91 "movzb (%2,%%r10,1),%%rax\n"
92 "movq 4096(%5,%%rax,8),%%xmm1\n"
93 "lea (%%r11,%6),%%r10\n"
94 "sar $0x10,%%r11\n"
95 "movzb (%0,%%r11,1),%%rax\n"
96 "paddsw %%xmm1,%%xmm0\n"
97 "movq (%5,%%rax,8),%%xmm1\n"
98 "lea (%%r10,%6),%%r11\n"
99 "sar $0x10,%%r10\n"
100 "movzb (%0,%%r10,1),%%rax\n"
101 "movq (%5,%%rax,8),%%xmm2\n"
102 "paddsw %%xmm0,%%xmm1\n"
103 "paddsw %%xmm0,%%xmm2\n"
104 "shufps $0x44,%%xmm2,%%xmm1\n"
105 "psraw $0x6,%%xmm1\n"
106 "packuswb %%xmm1,%%xmm1\n"
107 "movq %%xmm1,0x0(%3)\n"
108 "add $0x8,%3\n"
109 "sub $0x2,%4\n"
110 "jns 0b\n"
112 "1:"
113 "add $0x1,%4\n"
114 "js 2f\n"
116 "mov %%r11,%%r10\n"
117 "sar $0x11,%%r10\n"
118 "movzb (%1,%%r10,1),%%rax\n"
119 "movq 2048(%5,%%rax,8),%%xmm0\n"
120 "movzb (%2,%%r10,1),%%rax\n"
121 "movq 4096(%5,%%rax,8),%%xmm1\n"
122 "paddsw %%xmm1,%%xmm0\n"
123 "sar $0x10,%%r11\n"
124 "movzb (%0,%%r11,1),%%rax\n"
125 "movq (%5,%%rax,8),%%xmm1\n"
126 "paddsw %%xmm0,%%xmm1\n"
127 "psraw $0x6,%%xmm1\n"
128 "packuswb %%xmm1,%%xmm1\n"
129 "movd %%xmm1,0x0(%3)\n"
131 "2:"
133 : "r"(y_buf), // %0
134 "r"(u_buf), // %1
135 "r"(v_buf), // %2
136 "r"(rgb_buf), // %3
137 "r"(width), // %4
138 "r" (kCoefficientsRgbY), // %5
139 "r"(static_cast<long>(source_dx)) // %6
140 : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
144 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
145 const uint8* u_buf,
146 const uint8* v_buf,
147 uint8* rgb_buf,
148 int width,
149 int source_dx) {
150 asm(
151 "xor %%r11,%%r11\n" // x = 0
152 "sub $0x2,%4\n"
153 "js 2f\n"
154 "cmp $0x20000,%6\n" // if source_dx >= 2.0
155 "jl 0f\n"
156 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
157 "0:"
159 "1:"
160 "mov %%r11,%%r10\n"
161 "sar $0x11,%%r10\n"
163 "movzb (%1, %%r10, 1), %%r13 \n"
164 "movzb 1(%1, %%r10, 1), %%r14 \n"
165 "mov %%r11, %%rax \n"
166 "and $0x1fffe, %%rax \n"
167 "imul %%rax, %%r14 \n"
168 "xor $0x1fffe, %%rax \n"
169 "imul %%rax, %%r13 \n"
170 "add %%r14, %%r13 \n"
171 "shr $17, %%r13 \n"
172 "movq 2048(%5,%%r13,8), %%xmm0\n"
174 "movzb (%2, %%r10, 1), %%r13 \n"
175 "movzb 1(%2, %%r10, 1), %%r14 \n"
176 "mov %%r11, %%rax \n"
177 "and $0x1fffe, %%rax \n"
178 "imul %%rax, %%r14 \n"
179 "xor $0x1fffe, %%rax \n"
180 "imul %%rax, %%r13 \n"
181 "add %%r14, %%r13 \n"
182 "shr $17, %%r13 \n"
183 "movq 4096(%5,%%r13,8), %%xmm1\n"
185 "mov %%r11, %%rax \n"
186 "lea (%%r11,%6),%%r10\n"
187 "sar $0x10,%%r11\n"
188 "paddsw %%xmm1,%%xmm0\n"
190 "movzb (%0, %%r11, 1), %%r13 \n"
191 "movzb 1(%0, %%r11, 1), %%r14 \n"
192 "and $0xffff, %%rax \n"
193 "imul %%rax, %%r14 \n"
194 "xor $0xffff, %%rax \n"
195 "imul %%rax, %%r13 \n"
196 "add %%r14, %%r13 \n"
197 "shr $16, %%r13 \n"
198 "movq (%5,%%r13,8),%%xmm1\n"
200 "mov %%r10, %%rax \n"
201 "lea (%%r10,%6),%%r11\n"
202 "sar $0x10,%%r10\n"
204 "movzb (%0,%%r10,1), %%r13 \n"
205 "movzb 1(%0,%%r10,1), %%r14 \n"
206 "and $0xffff, %%rax \n"
207 "imul %%rax, %%r14 \n"
208 "xor $0xffff, %%rax \n"
209 "imul %%rax, %%r13 \n"
210 "add %%r14, %%r13 \n"
211 "shr $16, %%r13 \n"
212 "movq (%5,%%r13,8),%%xmm2\n"
214 "paddsw %%xmm0,%%xmm1\n"
215 "paddsw %%xmm0,%%xmm2\n"
216 "shufps $0x44,%%xmm2,%%xmm1\n"
217 "psraw $0x6,%%xmm1\n"
218 "packuswb %%xmm1,%%xmm1\n"
219 "movq %%xmm1,0x0(%3)\n"
220 "add $0x8,%3\n"
221 "sub $0x2,%4\n"
222 "jns 1b\n"
224 "2:"
225 "add $0x1,%4\n"
226 "js 3f\n"
228 "mov %%r11,%%r10\n"
229 "sar $0x11,%%r10\n"
231 "movzb (%1,%%r10,1), %%r13 \n"
232 "movq 2048(%5,%%r13,8),%%xmm0\n"
234 "movzb (%2,%%r10,1), %%r13 \n"
235 "movq 4096(%5,%%r13,8),%%xmm1\n"
237 "paddsw %%xmm1,%%xmm0\n"
238 "sar $0x10,%%r11\n"
240 "movzb (%0,%%r11,1), %%r13 \n"
241 "movq (%5,%%r13,8),%%xmm1\n"
243 "paddsw %%xmm0,%%xmm1\n"
244 "psraw $0x6,%%xmm1\n"
245 "packuswb %%xmm1,%%xmm1\n"
246 "movd %%xmm1,0x0(%3)\n"
248 "3:"
250 : "r"(y_buf), // %0
251 "r"(u_buf), // %1
252 "r"(v_buf), // %2
253 "r"(rgb_buf), // %3
254 "r"(width), // %4
255 "r" (kCoefficientsRgbY), // %5
256 "r"(static_cast<long>(source_dx)) // %6
257 : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
261 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
263 // PIC version is slower because less registers are available, so
264 // non-PIC is used on platforms where it is possible.
265 void FastConvertYUVToRGB32Row_SSE(const uint8* y_buf,
266 const uint8* u_buf,
267 const uint8* v_buf,
268 uint8* rgb_buf,
269 int width);
270 asm(
271 ".text\n"
272 ".global FastConvertYUVToRGB32Row_SSE\n"
273 ".type FastConvertYUVToRGB32Row_SSE, @function\n"
274 "FastConvertYUVToRGB32Row_SSE:\n"
275 "pusha\n"
276 "mov 0x24(%esp),%edx\n"
277 "mov 0x28(%esp),%edi\n"
278 "mov 0x2c(%esp),%esi\n"
279 "mov 0x30(%esp),%ebp\n"
280 "mov 0x34(%esp),%ecx\n"
281 "jmp 1f\n"
283 "0:"
284 "movzbl (%edi),%eax\n"
285 "add $0x1,%edi\n"
286 "movzbl (%esi),%ebx\n"
287 "add $0x1,%esi\n"
288 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
289 "movzbl (%edx),%eax\n"
290 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
291 "movzbl 0x1(%edx),%ebx\n"
292 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
293 "add $0x2,%edx\n"
294 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
295 "paddsw %mm0,%mm1\n"
296 "paddsw %mm0,%mm2\n"
297 "psraw $0x6,%mm1\n"
298 "psraw $0x6,%mm2\n"
299 "packuswb %mm2,%mm1\n"
300 "movntq %mm1,0x0(%ebp)\n"
301 "add $0x8,%ebp\n"
302 "1:"
303 "sub $0x2,%ecx\n"
304 "jns 0b\n"
306 "and $0x1,%ecx\n"
307 "je 2f\n"
309 "movzbl (%edi),%eax\n"
310 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
311 "movzbl (%esi),%eax\n"
312 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
313 "movzbl (%edx),%eax\n"
314 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
315 "paddsw %mm0,%mm1\n"
316 "psraw $0x6,%mm1\n"
317 "packuswb %mm1,%mm1\n"
318 "movd %mm1,0x0(%ebp)\n"
319 "2:"
320 "popa\n"
321 "ret\n"
322 #if !defined(XP_MACOSX)
323 ".previous\n"
324 #endif
327 void FastConvertYUVToRGB32Row(const uint8* y_buf,
328 const uint8* u_buf,
329 const uint8* v_buf,
330 uint8* rgb_buf,
331 int width)
333 if (mozilla::supports_sse()) {
334 FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
335 return;
338 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
342 void ScaleYUVToRGB32Row_SSE(const uint8* y_buf,
343 const uint8* u_buf,
344 const uint8* v_buf,
345 uint8* rgb_buf,
346 int width,
347 int source_dx);
348 asm(
349 ".text\n"
350 ".global ScaleYUVToRGB32Row_SSE\n"
351 ".type ScaleYUVToRGB32Row_SSE, @function\n"
352 "ScaleYUVToRGB32Row_SSE:\n"
353 "pusha\n"
354 "mov 0x24(%esp),%edx\n"
355 "mov 0x28(%esp),%edi\n"
356 "mov 0x2c(%esp),%esi\n"
357 "mov 0x30(%esp),%ebp\n"
358 "mov 0x34(%esp),%ecx\n"
359 "xor %ebx,%ebx\n"
360 "jmp 1f\n"
362 "0:"
363 "mov %ebx,%eax\n"
364 "sar $0x11,%eax\n"
365 "movzbl (%edi,%eax,1),%eax\n"
366 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
367 "mov %ebx,%eax\n"
368 "sar $0x11,%eax\n"
369 "movzbl (%esi,%eax,1),%eax\n"
370 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
371 "mov %ebx,%eax\n"
372 "add 0x38(%esp),%ebx\n"
373 "sar $0x10,%eax\n"
374 "movzbl (%edx,%eax,1),%eax\n"
375 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
376 "mov %ebx,%eax\n"
377 "add 0x38(%esp),%ebx\n"
378 "sar $0x10,%eax\n"
379 "movzbl (%edx,%eax,1),%eax\n"
380 "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
381 "paddsw %mm0,%mm1\n"
382 "paddsw %mm0,%mm2\n"
383 "psraw $0x6,%mm1\n"
384 "psraw $0x6,%mm2\n"
385 "packuswb %mm2,%mm1\n"
386 "movntq %mm1,0x0(%ebp)\n"
387 "add $0x8,%ebp\n"
388 "1:"
389 "sub $0x2,%ecx\n"
390 "jns 0b\n"
392 "and $0x1,%ecx\n"
393 "je 2f\n"
395 "mov %ebx,%eax\n"
396 "sar $0x11,%eax\n"
397 "movzbl (%edi,%eax,1),%eax\n"
398 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
399 "mov %ebx,%eax\n"
400 "sar $0x11,%eax\n"
401 "movzbl (%esi,%eax,1),%eax\n"
402 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
403 "mov %ebx,%eax\n"
404 "sar $0x10,%eax\n"
405 "movzbl (%edx,%eax,1),%eax\n"
406 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
407 "paddsw %mm0,%mm1\n"
408 "psraw $0x6,%mm1\n"
409 "packuswb %mm1,%mm1\n"
410 "movd %mm1,0x0(%ebp)\n"
412 "2:"
413 "popa\n"
414 "ret\n"
415 #if !defined(XP_MACOSX)
416 ".previous\n"
417 #endif
420 void ScaleYUVToRGB32Row(const uint8* y_buf,
421 const uint8* u_buf,
422 const uint8* v_buf,
423 uint8* rgb_buf,
424 int width,
425 int source_dx)
427 if (mozilla::supports_sse()) {
428 ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
429 width, source_dx);
432 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
433 width, source_dx);
436 void LinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
437 const uint8* u_buf,
438 const uint8* v_buf,
439 uint8* rgb_buf,
440 int width,
441 int source_dx);
442 asm(
443 ".text\n"
444 ".global LinearScaleYUVToRGB32Row_SSE\n"
445 ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
446 "LinearScaleYUVToRGB32Row_SSE:\n"
447 "pusha\n"
448 "mov 0x24(%esp),%edx\n"
449 "mov 0x28(%esp),%edi\n"
450 "mov 0x30(%esp),%ebp\n"
452 // source_width = width * source_dx + ebx
453 "mov 0x34(%esp), %ecx\n"
454 "imull 0x38(%esp), %ecx\n"
455 "mov %ecx, 0x34(%esp)\n"
457 "mov 0x38(%esp), %ecx\n"
458 "xor %ebx,%ebx\n" // x = 0
459 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
460 "jl 1f\n"
461 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
462 "jmp 1f\n"
464 "0:"
465 "mov %ebx,%eax\n"
466 "sar $0x11,%eax\n"
468 "movzbl (%edi,%eax,1),%ecx\n"
469 "movzbl 1(%edi,%eax,1),%esi\n"
470 "mov %ebx,%eax\n"
471 "andl $0x1fffe, %eax \n"
472 "imul %eax, %esi \n"
473 "xorl $0x1fffe, %eax \n"
474 "imul %eax, %ecx \n"
475 "addl %esi, %ecx \n"
476 "shrl $17, %ecx \n"
477 "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
479 "mov 0x2c(%esp),%esi\n"
480 "mov %ebx,%eax\n"
481 "sar $0x11,%eax\n"
483 "movzbl (%esi,%eax,1),%ecx\n"
484 "movzbl 1(%esi,%eax,1),%esi\n"
485 "mov %ebx,%eax\n"
486 "andl $0x1fffe, %eax \n"
487 "imul %eax, %esi \n"
488 "xorl $0x1fffe, %eax \n"
489 "imul %eax, %ecx \n"
490 "addl %esi, %ecx \n"
491 "shrl $17, %ecx \n"
492 "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
494 "mov %ebx,%eax\n"
495 "sar $0x10,%eax\n"
496 "movzbl (%edx,%eax,1),%ecx\n"
497 "movzbl 1(%edx,%eax,1),%esi\n"
498 "mov %ebx,%eax\n"
499 "add 0x38(%esp),%ebx\n"
500 "andl $0xffff, %eax \n"
501 "imul %eax, %esi \n"
502 "xorl $0xffff, %eax \n"
503 "imul %eax, %ecx \n"
504 "addl %esi, %ecx \n"
505 "shrl $16, %ecx \n"
506 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
508 "cmp 0x34(%esp), %ebx\n"
509 "jge 2f\n"
511 "mov %ebx,%eax\n"
512 "sar $0x10,%eax\n"
513 "movzbl (%edx,%eax,1),%ecx\n"
514 "movzbl 1(%edx,%eax,1),%esi\n"
515 "mov %ebx,%eax\n"
516 "add 0x38(%esp),%ebx\n"
517 "andl $0xffff, %eax \n"
518 "imul %eax, %esi \n"
519 "xorl $0xffff, %eax \n"
520 "imul %eax, %ecx \n"
521 "addl %esi, %ecx \n"
522 "shrl $16, %ecx \n"
523 "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
525 "paddsw %mm0,%mm1\n"
526 "paddsw %mm0,%mm2\n"
527 "psraw $0x6,%mm1\n"
528 "psraw $0x6,%mm2\n"
529 "packuswb %mm2,%mm1\n"
530 "movntq %mm1,0x0(%ebp)\n"
531 "add $0x8,%ebp\n"
533 "1:"
534 "cmp 0x34(%esp), %ebx\n"
535 "jl 0b\n"
536 "popa\n"
537 "ret\n"
539 "2:"
540 "paddsw %mm0, %mm1\n"
541 "psraw $6, %mm1\n"
542 "packuswb %mm1, %mm1\n"
543 "movd %mm1, (%ebp)\n"
544 "popa\n"
545 "ret\n"
546 #if !defined(XP_MACOSX)
547 ".previous\n"
548 #endif
551 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
552 const uint8* u_buf,
553 const uint8* v_buf,
554 uint8* rgb_buf,
555 int width,
556 int source_dx)
558 if (mozilla::supports_sse()) {
559 LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
560 width, source_dx);
563 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
564 width, source_dx);
567 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
569 void PICConvertYUVToRGB32Row_SSE(const uint8* y_buf,
570 const uint8* u_buf,
571 const uint8* v_buf,
572 uint8* rgb_buf,
573 int width,
574 int16 *kCoefficientsRgbY);
576 asm(
577 ".text\n"
578 #if defined(XP_MACOSX)
579 "_PICConvertYUVToRGB32Row_SSE:\n"
580 #else
581 "PICConvertYUVToRGB32Row_SSE:\n"
582 #endif
583 "pusha\n"
584 "mov 0x24(%esp),%edx\n"
585 "mov 0x28(%esp),%edi\n"
586 "mov 0x2c(%esp),%esi\n"
587 "mov 0x30(%esp),%ebp\n"
588 "mov 0x38(%esp),%ecx\n"
590 "jmp 1f\n"
592 "0:"
593 "movzbl (%edi),%eax\n"
594 "add $0x1,%edi\n"
595 "movzbl (%esi),%ebx\n"
596 "add $0x1,%esi\n"
597 "movq 2048(%ecx,%eax,8),%mm0\n"
598 "movzbl (%edx),%eax\n"
599 "paddsw 4096(%ecx,%ebx,8),%mm0\n"
600 "movzbl 0x1(%edx),%ebx\n"
601 "movq 0(%ecx,%eax,8),%mm1\n"
602 "add $0x2,%edx\n"
603 "movq 0(%ecx,%ebx,8),%mm2\n"
604 "paddsw %mm0,%mm1\n"
605 "paddsw %mm0,%mm2\n"
606 "psraw $0x6,%mm1\n"
607 "psraw $0x6,%mm2\n"
608 "packuswb %mm2,%mm1\n"
609 "movntq %mm1,0x0(%ebp)\n"
610 "add $0x8,%ebp\n"
611 "1:"
612 "subl $0x2,0x34(%esp)\n"
613 "jns 0b\n"
615 "andl $0x1,0x34(%esp)\n"
616 "je 2f\n"
618 "movzbl (%edi),%eax\n"
619 "movq 2048(%ecx,%eax,8),%mm0\n"
620 "movzbl (%esi),%eax\n"
621 "paddsw 4096(%ecx,%eax,8),%mm0\n"
622 "movzbl (%edx),%eax\n"
623 "movq 0(%ecx,%eax,8),%mm1\n"
624 "paddsw %mm0,%mm1\n"
625 "psraw $0x6,%mm1\n"
626 "packuswb %mm1,%mm1\n"
627 "movd %mm1,0x0(%ebp)\n"
628 "2:"
629 "popa\n"
630 "ret\n"
631 #if !defined(XP_MACOSX)
632 ".previous\n"
633 #endif
636 void FastConvertYUVToRGB32Row(const uint8* y_buf,
637 const uint8* u_buf,
638 const uint8* v_buf,
639 uint8* rgb_buf,
640 int width)
642 if (mozilla::supports_sse()) {
643 PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
644 &kCoefficientsRgbY[0][0]);
645 return;
648 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
651 void PICScaleYUVToRGB32Row_SSE(const uint8* y_buf,
652 const uint8* u_buf,
653 const uint8* v_buf,
654 uint8* rgb_buf,
655 int width,
656 int source_dx,
657 int16 *kCoefficientsRgbY);
659 asm(
660 ".text\n"
661 #if defined(XP_MACOSX)
662 "_PICScaleYUVToRGB32Row_SSE:\n"
663 #else
664 "PICScaleYUVToRGB32Row_SSE:\n"
665 #endif
666 "pusha\n"
667 "mov 0x24(%esp),%edx\n"
668 "mov 0x28(%esp),%edi\n"
669 "mov 0x2c(%esp),%esi\n"
670 "mov 0x30(%esp),%ebp\n"
671 "mov 0x3c(%esp),%ecx\n"
672 "xor %ebx,%ebx\n"
673 "jmp 1f\n"
675 "0:"
676 "mov %ebx,%eax\n"
677 "sar $0x11,%eax\n"
678 "movzbl (%edi,%eax,1),%eax\n"
679 "movq 2048(%ecx,%eax,8),%mm0\n"
680 "mov %ebx,%eax\n"
681 "sar $0x11,%eax\n"
682 "movzbl (%esi,%eax,1),%eax\n"
683 "paddsw 4096(%ecx,%eax,8),%mm0\n"
684 "mov %ebx,%eax\n"
685 "add 0x38(%esp),%ebx\n"
686 "sar $0x10,%eax\n"
687 "movzbl (%edx,%eax,1),%eax\n"
688 "movq 0(%ecx,%eax,8),%mm1\n"
689 "mov %ebx,%eax\n"
690 "add 0x38(%esp),%ebx\n"
691 "sar $0x10,%eax\n"
692 "movzbl (%edx,%eax,1),%eax\n"
693 "movq 0(%ecx,%eax,8),%mm2\n"
694 "paddsw %mm0,%mm1\n"
695 "paddsw %mm0,%mm2\n"
696 "psraw $0x6,%mm1\n"
697 "psraw $0x6,%mm2\n"
698 "packuswb %mm2,%mm1\n"
699 "movntq %mm1,0x0(%ebp)\n"
700 "add $0x8,%ebp\n"
701 "1:"
702 "subl $0x2,0x34(%esp)\n"
703 "jns 0b\n"
705 "andl $0x1,0x34(%esp)\n"
706 "je 2f\n"
708 "mov %ebx,%eax\n"
709 "sar $0x11,%eax\n"
710 "movzbl (%edi,%eax,1),%eax\n"
711 "movq 2048(%ecx,%eax,8),%mm0\n"
712 "mov %ebx,%eax\n"
713 "sar $0x11,%eax\n"
714 "movzbl (%esi,%eax,1),%eax\n"
715 "paddsw 4096(%ecx,%eax,8),%mm0\n"
716 "mov %ebx,%eax\n"
717 "sar $0x10,%eax\n"
718 "movzbl (%edx,%eax,1),%eax\n"
719 "movq 0(%ecx,%eax,8),%mm1\n"
720 "paddsw %mm0,%mm1\n"
721 "psraw $0x6,%mm1\n"
722 "packuswb %mm1,%mm1\n"
723 "movd %mm1,0x0(%ebp)\n"
725 "2:"
726 "popa\n"
727 "ret\n"
728 #if !defined(XP_MACOSX)
729 ".previous\n"
730 #endif
733 void ScaleYUVToRGB32Row(const uint8* y_buf,
734 const uint8* u_buf,
735 const uint8* v_buf,
736 uint8* rgb_buf,
737 int width,
738 int source_dx)
740 if (mozilla::supports_sse()) {
741 PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
742 &kCoefficientsRgbY[0][0]);
743 return;
746 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
749 void PICLinearScaleYUVToRGB32Row_SSE(const uint8* y_buf,
750 const uint8* u_buf,
751 const uint8* v_buf,
752 uint8* rgb_buf,
753 int width,
754 int source_dx,
755 int16 *kCoefficientsRgbY);
757 asm(
758 ".text\n"
759 #if defined(XP_MACOSX)
760 "_PICLinearScaleYUVToRGB32Row_SSE:\n"
761 #else
762 "PICLinearScaleYUVToRGB32Row_SSE:\n"
763 #endif
764 "pusha\n"
765 "mov 0x24(%esp),%edx\n"
766 "mov 0x30(%esp),%ebp\n"
767 "mov 0x34(%esp),%ecx\n"
768 "mov 0x3c(%esp),%edi\n"
769 "xor %ebx,%ebx\n"
771 // source_width = width * source_dx + ebx
772 "mov 0x34(%esp), %ecx\n"
773 "imull 0x38(%esp), %ecx\n"
774 "mov %ecx, 0x34(%esp)\n"
776 "mov 0x38(%esp), %ecx\n"
777 "xor %ebx,%ebx\n" // x = 0
778 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
779 "jl 1f\n"
780 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
781 "jmp 1f\n"
783 "0:"
784 "mov 0x28(%esp),%esi\n"
785 "mov %ebx,%eax\n"
786 "sar $0x11,%eax\n"
788 "movzbl (%esi,%eax,1),%ecx\n"
789 "movzbl 1(%esi,%eax,1),%esi\n"
790 "mov %ebx,%eax\n"
791 "andl $0x1fffe, %eax \n"
792 "imul %eax, %esi \n"
793 "xorl $0x1fffe, %eax \n"
794 "imul %eax, %ecx \n"
795 "addl %esi, %ecx \n"
796 "shrl $17, %ecx \n"
797 "movq 2048(%edi,%ecx,8),%mm0\n"
799 "mov 0x2c(%esp),%esi\n"
800 "mov %ebx,%eax\n"
801 "sar $0x11,%eax\n"
803 "movzbl (%esi,%eax,1),%ecx\n"
804 "movzbl 1(%esi,%eax,1),%esi\n"
805 "mov %ebx,%eax\n"
806 "andl $0x1fffe, %eax \n"
807 "imul %eax, %esi \n"
808 "xorl $0x1fffe, %eax \n"
809 "imul %eax, %ecx \n"
810 "addl %esi, %ecx \n"
811 "shrl $17, %ecx \n"
812 "paddsw 4096(%edi,%ecx,8),%mm0\n"
814 "mov %ebx,%eax\n"
815 "sar $0x10,%eax\n"
816 "movzbl (%edx,%eax,1),%ecx\n"
817 "movzbl 1(%edx,%eax,1),%esi\n"
818 "mov %ebx,%eax\n"
819 "add 0x38(%esp),%ebx\n"
820 "andl $0xffff, %eax \n"
821 "imul %eax, %esi \n"
822 "xorl $0xffff, %eax \n"
823 "imul %eax, %ecx \n"
824 "addl %esi, %ecx \n"
825 "shrl $16, %ecx \n"
826 "movq (%edi,%ecx,8),%mm1\n"
828 "cmp 0x34(%esp), %ebx\n"
829 "jge 2f\n"
831 "mov %ebx,%eax\n"
832 "sar $0x10,%eax\n"
833 "movzbl (%edx,%eax,1),%ecx\n"
834 "movzbl 1(%edx,%eax,1),%esi\n"
835 "mov %ebx,%eax\n"
836 "add 0x38(%esp),%ebx\n"
837 "andl $0xffff, %eax \n"
838 "imul %eax, %esi \n"
839 "xorl $0xffff, %eax \n"
840 "imul %eax, %ecx \n"
841 "addl %esi, %ecx \n"
842 "shrl $16, %ecx \n"
843 "movq (%edi,%ecx,8),%mm2\n"
845 "paddsw %mm0,%mm1\n"
846 "paddsw %mm0,%mm2\n"
847 "psraw $0x6,%mm1\n"
848 "psraw $0x6,%mm2\n"
849 "packuswb %mm2,%mm1\n"
850 "movntq %mm1,0x0(%ebp)\n"
851 "add $0x8,%ebp\n"
853 "1:"
854 "cmp %ebx, 0x34(%esp)\n"
855 "jg 0b\n"
856 "popa\n"
857 "ret\n"
859 "2:"
860 "paddsw %mm0, %mm1\n"
861 "psraw $6, %mm1\n"
862 "packuswb %mm1, %mm1\n"
863 "movd %mm1, (%ebp)\n"
864 "popa\n"
865 "ret\n"
866 #if !defined(XP_MACOSX)
867 ".previous\n"
868 #endif
872 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
873 const uint8* u_buf,
874 const uint8* v_buf,
875 uint8* rgb_buf,
876 int width,
877 int source_dx)
879 if (mozilla::supports_sse()) {
880 PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
881 source_dx, &kCoefficientsRgbY[0][0]);
882 return;
885 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
887 #else
888 void FastConvertYUVToRGB32Row(const uint8* y_buf,
889 const uint8* u_buf,
890 const uint8* v_buf,
891 uint8* rgb_buf,
892 int width) {
893 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
896 void ScaleYUVToRGB32Row(const uint8* y_buf,
897 const uint8* u_buf,
898 const uint8* v_buf,
899 uint8* rgb_buf,
900 int width,
901 int source_dx) {
902 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
905 void LinearScaleYUVToRGB32Row(const uint8* y_buf,
906 const uint8* u_buf,
907 const uint8* v_buf,
908 uint8* rgb_buf,
909 int width,
910 int source_dx) {
911 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
913 #endif