1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
6 #include "mozilla/SSE.h"
12 #if defined(ARCH_CPU_X86_64)
14 // We don't need CPUID guards here, since x86-64 implies SSE2.
16 // AMD64 ABI uses register parameters.
17 void FastConvertYUVToRGB32Row(const uint8
* y_buf
, // rdi
18 const uint8
* u_buf
, // rsi
19 const uint8
* v_buf
, // rdx
20 uint8
* rgb_buf
, // rcx
29 "movq 2048(%5,%%r10,8),%%xmm0\n"
31 "movq 4096(%5,%%r11,8),%%xmm1\n"
32 "movzb 0x1(%0),%%r11\n"
33 "paddsw %%xmm1,%%xmm0\n"
34 "movq (%5,%%r10,8),%%xmm2\n"
36 "movq (%5,%%r11,8),%%xmm3\n"
37 "paddsw %%xmm0,%%xmm2\n"
38 "paddsw %%xmm0,%%xmm3\n"
39 "shufps $0x44,%%xmm3,%%xmm2\n"
41 "packuswb %%xmm2,%%xmm2\n"
42 "movq %%xmm2,0x0(%3)\n"
53 "movq 2048(%5,%%r10,8),%%xmm0\n"
55 "movq 4096(%5,%%r10,8),%%xmm1\n"
56 "paddsw %%xmm1,%%xmm0\n"
58 "movq (%5,%%r10,8),%%xmm1\n"
59 "paddsw %%xmm0,%%xmm1\n"
61 "packuswb %%xmm1,%%xmm1\n"
62 "movd %%xmm1,0x0(%3)\n"
70 "r" (kCoefficientsRgbY
) // %5
71 : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
75 void ScaleYUVToRGB32Row(const uint8
* y_buf
, // rdi
76 const uint8
* u_buf
, // rsi
77 const uint8
* v_buf
, // rdx
78 uint8
* rgb_buf
, // rcx
80 int source_dx
) { // r9
89 "movzb (%1,%%r10,1),%%rax\n"
90 "movq 2048(%5,%%rax,8),%%xmm0\n"
91 "movzb (%2,%%r10,1),%%rax\n"
92 "movq 4096(%5,%%rax,8),%%xmm1\n"
93 "lea (%%r11,%6),%%r10\n"
95 "movzb (%0,%%r11,1),%%rax\n"
96 "paddsw %%xmm1,%%xmm0\n"
97 "movq (%5,%%rax,8),%%xmm1\n"
98 "lea (%%r10,%6),%%r11\n"
100 "movzb (%0,%%r10,1),%%rax\n"
101 "movq (%5,%%rax,8),%%xmm2\n"
102 "paddsw %%xmm0,%%xmm1\n"
103 "paddsw %%xmm0,%%xmm2\n"
104 "shufps $0x44,%%xmm2,%%xmm1\n"
105 "psraw $0x6,%%xmm1\n"
106 "packuswb %%xmm1,%%xmm1\n"
107 "movq %%xmm1,0x0(%3)\n"
118 "movzb (%1,%%r10,1),%%rax\n"
119 "movq 2048(%5,%%rax,8),%%xmm0\n"
120 "movzb (%2,%%r10,1),%%rax\n"
121 "movq 4096(%5,%%rax,8),%%xmm1\n"
122 "paddsw %%xmm1,%%xmm0\n"
124 "movzb (%0,%%r11,1),%%rax\n"
125 "movq (%5,%%rax,8),%%xmm1\n"
126 "paddsw %%xmm0,%%xmm1\n"
127 "psraw $0x6,%%xmm1\n"
128 "packuswb %%xmm1,%%xmm1\n"
129 "movd %%xmm1,0x0(%3)\n"
138 "r" (kCoefficientsRgbY
), // %5
139 "r"(static_cast<long>(source_dx
)) // %6
140 : "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
144 void LinearScaleYUVToRGB32Row(const uint8
* y_buf
,
151 "xor %%r11,%%r11\n" // x = 0
154 "cmp $0x20000,%6\n" // if source_dx >= 2.0
156 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
163 "movzb (%1, %%r10, 1), %%r13 \n"
164 "movzb 1(%1, %%r10, 1), %%r14 \n"
165 "mov %%r11, %%rax \n"
166 "and $0x1fffe, %%rax \n"
167 "imul %%rax, %%r14 \n"
168 "xor $0x1fffe, %%rax \n"
169 "imul %%rax, %%r13 \n"
170 "add %%r14, %%r13 \n"
172 "movq 2048(%5,%%r13,8), %%xmm0\n"
174 "movzb (%2, %%r10, 1), %%r13 \n"
175 "movzb 1(%2, %%r10, 1), %%r14 \n"
176 "mov %%r11, %%rax \n"
177 "and $0x1fffe, %%rax \n"
178 "imul %%rax, %%r14 \n"
179 "xor $0x1fffe, %%rax \n"
180 "imul %%rax, %%r13 \n"
181 "add %%r14, %%r13 \n"
183 "movq 4096(%5,%%r13,8), %%xmm1\n"
185 "mov %%r11, %%rax \n"
186 "lea (%%r11,%6),%%r10\n"
188 "paddsw %%xmm1,%%xmm0\n"
190 "movzb (%0, %%r11, 1), %%r13 \n"
191 "movzb 1(%0, %%r11, 1), %%r14 \n"
192 "and $0xffff, %%rax \n"
193 "imul %%rax, %%r14 \n"
194 "xor $0xffff, %%rax \n"
195 "imul %%rax, %%r13 \n"
196 "add %%r14, %%r13 \n"
198 "movq (%5,%%r13,8),%%xmm1\n"
200 "mov %%r10, %%rax \n"
201 "lea (%%r10,%6),%%r11\n"
204 "movzb (%0,%%r10,1), %%r13 \n"
205 "movzb 1(%0,%%r10,1), %%r14 \n"
206 "and $0xffff, %%rax \n"
207 "imul %%rax, %%r14 \n"
208 "xor $0xffff, %%rax \n"
209 "imul %%rax, %%r13 \n"
210 "add %%r14, %%r13 \n"
212 "movq (%5,%%r13,8),%%xmm2\n"
214 "paddsw %%xmm0,%%xmm1\n"
215 "paddsw %%xmm0,%%xmm2\n"
216 "shufps $0x44,%%xmm2,%%xmm1\n"
217 "psraw $0x6,%%xmm1\n"
218 "packuswb %%xmm1,%%xmm1\n"
219 "movq %%xmm1,0x0(%3)\n"
231 "movzb (%1,%%r10,1), %%r13 \n"
232 "movq 2048(%5,%%r13,8),%%xmm0\n"
234 "movzb (%2,%%r10,1), %%r13 \n"
235 "movq 4096(%5,%%r13,8),%%xmm1\n"
237 "paddsw %%xmm1,%%xmm0\n"
240 "movzb (%0,%%r11,1), %%r13 \n"
241 "movq (%5,%%r13,8),%%xmm1\n"
243 "paddsw %%xmm0,%%xmm1\n"
244 "psraw $0x6,%%xmm1\n"
245 "packuswb %%xmm1,%%xmm1\n"
246 "movd %%xmm1,0x0(%3)\n"
255 "r" (kCoefficientsRgbY
), // %5
256 "r"(static_cast<long>(source_dx
)) // %6
257 : "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
261 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
263 // PIC version is slower because fewer registers are available, so
264 // non-PIC is used on platforms where it is possible.
265 void FastConvertYUVToRGB32Row_SSE(const uint8
* y_buf
,
272 ".global FastConvertYUVToRGB32Row_SSE\n"
273 ".type FastConvertYUVToRGB32Row_SSE, @function\n"
274 "FastConvertYUVToRGB32Row_SSE:\n"
276 "mov 0x24(%esp),%edx\n"
277 "mov 0x28(%esp),%edi\n"
278 "mov 0x2c(%esp),%esi\n"
279 "mov 0x30(%esp),%ebp\n"
280 "mov 0x34(%esp),%ecx\n"
284 "movzbl (%edi),%eax\n"
286 "movzbl (%esi),%ebx\n"
288 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
289 "movzbl (%edx),%eax\n"
290 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
291 "movzbl 0x1(%edx),%ebx\n"
292 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
294 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
299 "packuswb %mm2,%mm1\n"
300 "movntq %mm1,0x0(%ebp)\n"
309 "movzbl (%edi),%eax\n"
310 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
311 "movzbl (%esi),%eax\n"
312 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
313 "movzbl (%edx),%eax\n"
314 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
317 "packuswb %mm1,%mm1\n"
318 "movd %mm1,0x0(%ebp)\n"
322 #if !defined(XP_MACOSX)
327 void FastConvertYUVToRGB32Row(const uint8
* y_buf
,
333 if (mozilla::supports_sse()) {
334 FastConvertYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
);
338 FastConvertYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, 1);
342 void ScaleYUVToRGB32Row_SSE(const uint8
* y_buf
,
350 ".global ScaleYUVToRGB32Row_SSE\n"
351 ".type ScaleYUVToRGB32Row_SSE, @function\n"
352 "ScaleYUVToRGB32Row_SSE:\n"
354 "mov 0x24(%esp),%edx\n"
355 "mov 0x28(%esp),%edi\n"
356 "mov 0x2c(%esp),%esi\n"
357 "mov 0x30(%esp),%ebp\n"
358 "mov 0x34(%esp),%ecx\n"
365 "movzbl (%edi,%eax,1),%eax\n"
366 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
369 "movzbl (%esi,%eax,1),%eax\n"
370 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
372 "add 0x38(%esp),%ebx\n"
374 "movzbl (%edx,%eax,1),%eax\n"
375 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
377 "add 0x38(%esp),%ebx\n"
379 "movzbl (%edx,%eax,1),%eax\n"
380 "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
385 "packuswb %mm2,%mm1\n"
386 "movntq %mm1,0x0(%ebp)\n"
397 "movzbl (%edi,%eax,1),%eax\n"
398 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
401 "movzbl (%esi,%eax,1),%eax\n"
402 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
405 "movzbl (%edx,%eax,1),%eax\n"
406 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
409 "packuswb %mm1,%mm1\n"
410 "movd %mm1,0x0(%ebp)\n"
415 #if !defined(XP_MACOSX)
420 void ScaleYUVToRGB32Row(const uint8
* y_buf
,
427 if (mozilla::supports_sse()) {
428 ScaleYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
,
432 ScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
,
436 void LinearScaleYUVToRGB32Row_SSE(const uint8
* y_buf
,
444 ".global LinearScaleYUVToRGB32Row_SSE\n"
445 ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
446 "LinearScaleYUVToRGB32Row_SSE:\n"
448 "mov 0x24(%esp),%edx\n"
449 "mov 0x28(%esp),%edi\n"
450 "mov 0x30(%esp),%ebp\n"
452 // source_width = width * source_dx, written back over the width slot at 0x34(%esp); ebx (x) is not added here
453 "mov 0x34(%esp), %ecx\n"
454 "imull 0x38(%esp), %ecx\n"
455 "mov %ecx, 0x34(%esp)\n"
457 "mov 0x38(%esp), %ecx\n"
458 "xor %ebx,%ebx\n" // x = 0
459 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
461 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
468 "movzbl (%edi,%eax,1),%ecx\n"
469 "movzbl 1(%edi,%eax,1),%esi\n"
471 "andl $0x1fffe, %eax \n"
473 "xorl $0x1fffe, %eax \n"
477 "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
479 "mov 0x2c(%esp),%esi\n"
483 "movzbl (%esi,%eax,1),%ecx\n"
484 "movzbl 1(%esi,%eax,1),%esi\n"
486 "andl $0x1fffe, %eax \n"
488 "xorl $0x1fffe, %eax \n"
492 "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
496 "movzbl (%edx,%eax,1),%ecx\n"
497 "movzbl 1(%edx,%eax,1),%esi\n"
499 "add 0x38(%esp),%ebx\n"
500 "andl $0xffff, %eax \n"
502 "xorl $0xffff, %eax \n"
506 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
508 "cmp 0x34(%esp), %ebx\n"
513 "movzbl (%edx,%eax,1),%ecx\n"
514 "movzbl 1(%edx,%eax,1),%esi\n"
516 "add 0x38(%esp),%ebx\n"
517 "andl $0xffff, %eax \n"
519 "xorl $0xffff, %eax \n"
523 "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
529 "packuswb %mm2,%mm1\n"
530 "movntq %mm1,0x0(%ebp)\n"
534 "cmp 0x34(%esp), %ebx\n"
540 "paddsw %mm0, %mm1\n"
542 "packuswb %mm1, %mm1\n"
543 "movd %mm1, (%ebp)\n"
546 #if !defined(XP_MACOSX)
551 void LinearScaleYUVToRGB32Row(const uint8
* y_buf
,
558 if (mozilla::supports_sse()) {
559 LinearScaleYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
,
563 LinearScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
,
567 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
569 void PICConvertYUVToRGB32Row_SSE(const uint8
* y_buf
,
574 int16
*kCoefficientsRgbY
);
578 #if defined(XP_MACOSX)
579 "_PICConvertYUVToRGB32Row_SSE:\n"
581 "PICConvertYUVToRGB32Row_SSE:\n"
584 "mov 0x24(%esp),%edx\n"
585 "mov 0x28(%esp),%edi\n"
586 "mov 0x2c(%esp),%esi\n"
587 "mov 0x30(%esp),%ebp\n"
588 "mov 0x38(%esp),%ecx\n"
593 "movzbl (%edi),%eax\n"
595 "movzbl (%esi),%ebx\n"
597 "movq 2048(%ecx,%eax,8),%mm0\n"
598 "movzbl (%edx),%eax\n"
599 "paddsw 4096(%ecx,%ebx,8),%mm0\n"
600 "movzbl 0x1(%edx),%ebx\n"
601 "movq 0(%ecx,%eax,8),%mm1\n"
603 "movq 0(%ecx,%ebx,8),%mm2\n"
608 "packuswb %mm2,%mm1\n"
609 "movntq %mm1,0x0(%ebp)\n"
612 "subl $0x2,0x34(%esp)\n"
615 "andl $0x1,0x34(%esp)\n"
618 "movzbl (%edi),%eax\n"
619 "movq 2048(%ecx,%eax,8),%mm0\n"
620 "movzbl (%esi),%eax\n"
621 "paddsw 4096(%ecx,%eax,8),%mm0\n"
622 "movzbl (%edx),%eax\n"
623 "movq 0(%ecx,%eax,8),%mm1\n"
626 "packuswb %mm1,%mm1\n"
627 "movd %mm1,0x0(%ebp)\n"
631 #if !defined(XP_MACOSX)
636 void FastConvertYUVToRGB32Row(const uint8
* y_buf
,
642 if (mozilla::supports_sse()) {
643 PICConvertYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
,
644 &kCoefficientsRgbY
[0][0]);
648 FastConvertYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, 1);
651 void PICScaleYUVToRGB32Row_SSE(const uint8
* y_buf
,
657 int16
*kCoefficientsRgbY
);
661 #if defined(XP_MACOSX)
662 "_PICScaleYUVToRGB32Row_SSE:\n"
664 "PICScaleYUVToRGB32Row_SSE:\n"
667 "mov 0x24(%esp),%edx\n"
668 "mov 0x28(%esp),%edi\n"
669 "mov 0x2c(%esp),%esi\n"
670 "mov 0x30(%esp),%ebp\n"
671 "mov 0x3c(%esp),%ecx\n"
678 "movzbl (%edi,%eax,1),%eax\n"
679 "movq 2048(%ecx,%eax,8),%mm0\n"
682 "movzbl (%esi,%eax,1),%eax\n"
683 "paddsw 4096(%ecx,%eax,8),%mm0\n"
685 "add 0x38(%esp),%ebx\n"
687 "movzbl (%edx,%eax,1),%eax\n"
688 "movq 0(%ecx,%eax,8),%mm1\n"
690 "add 0x38(%esp),%ebx\n"
692 "movzbl (%edx,%eax,1),%eax\n"
693 "movq 0(%ecx,%eax,8),%mm2\n"
698 "packuswb %mm2,%mm1\n"
699 "movntq %mm1,0x0(%ebp)\n"
702 "subl $0x2,0x34(%esp)\n"
705 "andl $0x1,0x34(%esp)\n"
710 "movzbl (%edi,%eax,1),%eax\n"
711 "movq 2048(%ecx,%eax,8),%mm0\n"
714 "movzbl (%esi,%eax,1),%eax\n"
715 "paddsw 4096(%ecx,%eax,8),%mm0\n"
718 "movzbl (%edx,%eax,1),%eax\n"
719 "movq 0(%ecx,%eax,8),%mm1\n"
722 "packuswb %mm1,%mm1\n"
723 "movd %mm1,0x0(%ebp)\n"
728 #if !defined(XP_MACOSX)
733 void ScaleYUVToRGB32Row(const uint8
* y_buf
,
740 if (mozilla::supports_sse()) {
741 PICScaleYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
,
742 &kCoefficientsRgbY
[0][0]);
746 ScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);
749 void PICLinearScaleYUVToRGB32Row_SSE(const uint8
* y_buf
,
755 int16
*kCoefficientsRgbY
);
759 #if defined(XP_MACOSX)
760 "_PICLinearScaleYUVToRGB32Row_SSE:\n"
762 "PICLinearScaleYUVToRGB32Row_SSE:\n"
765 "mov 0x24(%esp),%edx\n"
766 "mov 0x30(%esp),%ebp\n"
767 "mov 0x34(%esp),%ecx\n"
768 "mov 0x3c(%esp),%edi\n"
771 // source_width = width * source_dx, written back over the width slot at 0x34(%esp); ebx (x) is not added here
772 "mov 0x34(%esp), %ecx\n"
773 "imull 0x38(%esp), %ecx\n"
774 "mov %ecx, 0x34(%esp)\n"
776 "mov 0x38(%esp), %ecx\n"
777 "xor %ebx,%ebx\n" // x = 0
778 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
780 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
784 "mov 0x28(%esp),%esi\n"
788 "movzbl (%esi,%eax,1),%ecx\n"
789 "movzbl 1(%esi,%eax,1),%esi\n"
791 "andl $0x1fffe, %eax \n"
793 "xorl $0x1fffe, %eax \n"
797 "movq 2048(%edi,%ecx,8),%mm0\n"
799 "mov 0x2c(%esp),%esi\n"
803 "movzbl (%esi,%eax,1),%ecx\n"
804 "movzbl 1(%esi,%eax,1),%esi\n"
806 "andl $0x1fffe, %eax \n"
808 "xorl $0x1fffe, %eax \n"
812 "paddsw 4096(%edi,%ecx,8),%mm0\n"
816 "movzbl (%edx,%eax,1),%ecx\n"
817 "movzbl 1(%edx,%eax,1),%esi\n"
819 "add 0x38(%esp),%ebx\n"
820 "andl $0xffff, %eax \n"
822 "xorl $0xffff, %eax \n"
826 "movq (%edi,%ecx,8),%mm1\n"
828 "cmp 0x34(%esp), %ebx\n"
833 "movzbl (%edx,%eax,1),%ecx\n"
834 "movzbl 1(%edx,%eax,1),%esi\n"
836 "add 0x38(%esp),%ebx\n"
837 "andl $0xffff, %eax \n"
839 "xorl $0xffff, %eax \n"
843 "movq (%edi,%ecx,8),%mm2\n"
849 "packuswb %mm2,%mm1\n"
850 "movntq %mm1,0x0(%ebp)\n"
854 "cmp %ebx, 0x34(%esp)\n"
860 "paddsw %mm0, %mm1\n"
862 "packuswb %mm1, %mm1\n"
863 "movd %mm1, (%ebp)\n"
866 #if !defined(XP_MACOSX)
872 void LinearScaleYUVToRGB32Row(const uint8
* y_buf
,
879 if (mozilla::supports_sse()) {
880 PICLinearScaleYUVToRGB32Row_SSE(y_buf
, u_buf
, v_buf
, rgb_buf
, width
,
881 source_dx
, &kCoefficientsRgbY
[0][0]);
885 LinearScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);
888 void FastConvertYUVToRGB32Row(const uint8
* y_buf
,
893 FastConvertYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, 1);
896 void ScaleYUVToRGB32Row(const uint8
* y_buf
,
902 ScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);
905 void LinearScaleYUVToRGB32Row(const uint8
* y_buf
,
911 LinearScaleYUVToRGB32Row_C(y_buf
, u_buf
, v_buf
, rgb_buf
, width
, source_dx
);