Bug 1857841 - pt 3. Add a new page kind named "fresh" r=glandium
[gecko.git] / gfx / ycbcr / yuv_row_posix.cpp
blobc5e55abe4cec983611273a4634273f45067c11c3
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
5 #include "yuv_row.h"
6 #include "mozilla/SSE.h"
8 #define DCHECK(a)
10 extern "C" {
12 #if defined(ARCH_CPU_X86_64)
14 // We don't need CPUID guards here, since x86-64 implies SSE2.
16 // AMD64 ABI uses register parameters.
// x86-64 version: converts one row of Y/U/V samples to 4-byte output pixels
// through the kCoefficientsRgbY lookup table, without scaling.
//
// The table is indexed with 8-byte entries: Y lookups start at byte offset 0,
// U lookups at 2048 and V lookups at 4096. Each main-loop pass consumes two Y
// samples plus one U and one V sample and stores 8 output bytes.
17 void FastConvertYUVToRGB32Row(const uint8_t* y_buf, // rdi
18 const uint8_t* u_buf, // rsi
19 const uint8_t* v_buf, // rdx
20 uint8_t* rgb_buf, // rcx
21 int width) { // r8
22 asm volatile(
// Enter the loop at its width test (label 1).
23 "jmp 1f\n"
// Main loop: two pixels per pass.
24 "0:"
25 "movzb (%[u_buf]),%%r10\n"
26 "add $0x1,%[u_buf]\n"
27 "movzb (%[v_buf]),%%r11\n"
28 "add $0x1,%[v_buf]\n"
29 "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
30 "movzb (%[y_buf]),%%r10\n"
31 "movq 4096(%[kCoefficientsRgbY],%%r11,8),%%xmm1\n"
32 "movzb 0x1(%[y_buf]),%%r11\n"
// xmm0 = U entry + V entry (saturating 16-bit adds), shared by both pixels.
33 "paddsw %%xmm1,%%xmm0\n"
34 "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm2\n"
35 "add $0x2,%[y_buf]\n"
36 "movq (%[kCoefficientsRgbY],%%r11,8),%%xmm3\n"
37 "paddsw %%xmm0,%%xmm2\n"
38 "paddsw %%xmm0,%%xmm3\n"
// Put the low 64 bits of both pixels side by side in xmm2, shift every
// 16-bit lane right by 6 (arithmetic) and pack to bytes with unsigned
// saturation before storing 8 bytes.
39 "shufps $0x44,%%xmm3,%%xmm2\n"
40 "psraw $0x6,%%xmm2\n"
41 "packuswb %%xmm2,%%xmm2\n"
42 "movq %%xmm2,0x0(%[rgb_buf])\n"
43 "add $0x8,%[rgb_buf]\n"
// Loop test: keep going while at least two pixels remain.
44 "1:"
45 "sub $0x2,%[width]\n"
46 "jns 0b\n"
// Tail: for a non-negative input width, width is now -2 (nothing left) or -1
// (one pixel left); adding 1 leaves the sign flag clear only in the latter
// case.
48 "2:"
49 "add $0x1,%[width]\n"
50 "js 3f\n"
// Final odd pixel: same lookups for a single Y sample, storing 4 bytes.
52 "movzb (%[u_buf]),%%r10\n"
53 "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
54 "movzb (%[v_buf]),%%r10\n"
55 "movq 4096(%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
56 "paddsw %%xmm1,%%xmm0\n"
57 "movzb (%[y_buf]),%%r10\n"
58 "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
59 "paddsw %%xmm0,%%xmm1\n"
60 "psraw $0x6,%%xmm1\n"
61 "packuswb %%xmm1,%%xmm1\n"
62 "movd %%xmm1,0x0(%[rgb_buf])\n"
63 "3:"
// All five arguments are read-write operands: the assembly advances the
// pointers and counts width down.
64 : [y_buf] "+r"(y_buf),
65 [u_buf] "+r"(u_buf),
66 [v_buf] "+r"(v_buf),
67 [rgb_buf] "+r"(rgb_buf),
68 [width] "+r"(width)
69 : [kCoefficientsRgbY] "r" (kCoefficientsRgbY)
// r10/r11 and xmm0-xmm3 are the scratch registers used above.
70 : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
// x86-64 version: converts one row while stepping through the source by
// source_dx per output pixel, picking the nearest sample (no blending).
//
// r11 holds the running source position x. The Y index is x >> 16 and the
// U/V index is x >> 17. The kCoefficientsRgbY table is indexed with 8-byte
// entries (Y at byte offset 0, U at 2048, V at 4096).
74 void ScaleYUVToRGB32Row(const uint8_t* y_buf, // rdi
75 const uint8_t* u_buf, // rsi
76 const uint8_t* v_buf, // rdx
77 uint8_t* rgb_buf, // rcx
78 int width, // r8
79 int source_dx) { // r9
80 asm volatile(
// x = 0; go straight to the tail if fewer than two pixels are requested.
81 "xor %%r11,%%r11\n"
82 "sub $0x2,%[width]\n"
83 "js 1f\n"
// Main loop: two output pixels per pass.
85 "0:"
// U/V index for this pair = x >> 17.
86 "mov %%r11,%%r10\n"
87 "sar $0x11,%%r10\n"
88 "movzb (%[u_buf],%%r10,1),%%rax\n"
89 "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"
90 "movzb (%[v_buf],%%r10,1),%%rax\n"
91 "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
// r10 = x + source_dx (position of the second pixel); r11 >> 16 is the Y
// index of the first pixel.
92 "lea (%%r11,%[source_dx]),%%r10\n"
93 "sar $0x10,%%r11\n"
94 "movzb (%[y_buf],%%r11,1),%%rax\n"
95 "paddsw %%xmm1,%%xmm0\n"
96 "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
// r11 = position for the next pass; r10 >> 16 is the Y index of the second
// pixel.
97 "lea (%%r10,%[source_dx]),%%r11\n"
98 "sar $0x10,%%r10\n"
99 "movzb (%[y_buf],%%r10,1),%%rax\n"
100 "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm2\n"
101 "paddsw %%xmm0,%%xmm1\n"
102 "paddsw %%xmm0,%%xmm2\n"
// Combine both pixels, shift each 16-bit lane right by 6, pack to bytes with
// unsigned saturation and store 8 bytes.
103 "shufps $0x44,%%xmm2,%%xmm1\n"
104 "psraw $0x6,%%xmm1\n"
105 "packuswb %%xmm1,%%xmm1\n"
106 "movq %%xmm1,0x0(%[rgb_buf])\n"
107 "add $0x8,%[rgb_buf]\n"
108 "sub $0x2,%[width]\n"
109 "jns 0b\n"
// Tail: handle one remaining pixel, if any, and store 4 bytes.
111 "1:"
112 "add $0x1,%[width]\n"
113 "js 2f\n"
115 "mov %%r11,%%r10\n"
116 "sar $0x11,%%r10\n"
117 "movzb (%[u_buf],%%r10,1),%%rax\n"
118 "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"
119 "movzb (%[v_buf],%%r10,1),%%rax\n"
120 "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
121 "paddsw %%xmm1,%%xmm0\n"
122 "sar $0x10,%%r11\n"
123 "movzb (%[y_buf],%%r11,1),%%rax\n"
124 "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
125 "paddsw %%xmm0,%%xmm1\n"
126 "psraw $0x6,%%xmm1\n"
127 "packuswb %%xmm1,%%xmm1\n"
128 "movd %%xmm1,0x0(%[rgb_buf])\n"
130 "2:"
// Only rgb_buf and width are modified; the source pointers are indexed, not
// advanced, so they are plain inputs.
131 : [rgb_buf] "+r"(rgb_buf),
132 [width] "+r"(width)
133 : [y_buf] "r"(y_buf),
134 [u_buf] "r"(u_buf),
135 [v_buf] "r"(v_buf),
136 [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
// source_dx is widened to long because it is used in the lea address
// arithmetic above together with the 64-bit registers r10/r11.
137 [source_dx] "r"(static_cast<long>(source_dx))
138 : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
// x86-64 version: like the scaling converter, but each Y, U and V value used
// in the main loop is a weighted blend of two adjacent source bytes.
//
// r11 holds the running source position x. For Y the sample index is x >> 16
// and the blend weight is x & 0xffff; for U/V the index is x >> 17 and the
// weight is x & 0x1fffe. Note the main loop reads the byte at index + 1 as
// well as the byte at index.
142 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
143 const uint8_t* u_buf,
144 const uint8_t* v_buf,
145 uint8_t* rgb_buf,
146 int width,
147 int source_dx) {
148 asm volatile(
149 "xor %%r11,%%r11\n" // x = 0
// Fewer than two pixels requested: go straight to the tail (x stays 0).
150 "sub $0x2,%[width]\n"
151 "js 2f\n"
152 "cmp $0x20000,%[source_dx]\n" // if source_dx >= 2.0
153 "jl 0f\n"
154 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
155 "0:"
// Main loop: two output pixels per pass.
157 "1:"
158 "mov %%r11,%%r10\n"
159 "sar $0x11,%%r10\n"
// U: r13 = (u[i] * (weight ^ 0x1fffe) + u[i + 1] * weight) >> 17, then used
// as the table index.
161 "movzb (%[u_buf], %%r10, 1), %%r13 \n"
162 "movzb 1(%[u_buf], %%r10, 1), %%r14 \n"
163 "mov %%r11, %%rax \n"
164 "and $0x1fffe, %%rax \n"
165 "imul %%rax, %%r14 \n"
166 "xor $0x1fffe, %%rax \n"
167 "imul %%rax, %%r13 \n"
168 "add %%r14, %%r13 \n"
169 "shr $17, %%r13 \n"
170 "movq 2048(%[kCoefficientsRgbY],%%r13,8), %%xmm0\n"
// V: same blend as for U.
172 "movzb (%[v_buf], %%r10, 1), %%r13 \n"
173 "movzb 1(%[v_buf], %%r10, 1), %%r14 \n"
174 "mov %%r11, %%rax \n"
175 "and $0x1fffe, %%rax \n"
176 "imul %%rax, %%r14 \n"
177 "xor $0x1fffe, %%rax \n"
178 "imul %%rax, %%r13 \n"
179 "add %%r14, %%r13 \n"
180 "shr $17, %%r13 \n"
181 "movq 4096(%[kCoefficientsRgbY],%%r13,8), %%xmm1\n"
// First pixel: rax keeps x for the weight, r10 = x + source_dx, r11 becomes
// the Y index. xmm0 = U entry + V entry.
183 "mov %%r11, %%rax \n"
184 "lea (%%r11,%[source_dx]),%%r10\n"
185 "sar $0x10,%%r11\n"
186 "paddsw %%xmm1,%%xmm0\n"
// Y blend: (y[i] * (weight ^ 0xffff) + y[i + 1] * weight) >> 16.
188 "movzb (%[y_buf], %%r11, 1), %%r13 \n"
189 "movzb 1(%[y_buf], %%r11, 1), %%r14 \n"
190 "and $0xffff, %%rax \n"
191 "imul %%rax, %%r14 \n"
192 "xor $0xffff, %%rax \n"
193 "imul %%rax, %%r13 \n"
194 "add %%r14, %%r13 \n"
195 "shr $16, %%r13 \n"
196 "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
// Second pixel: r11 = position for the next pass, r10 becomes the Y index.
198 "mov %%r10, %%rax \n"
199 "lea (%%r10,%[source_dx]),%%r11\n"
200 "sar $0x10,%%r10\n"
202 "movzb (%[y_buf],%%r10,1), %%r13 \n"
203 "movzb 1(%[y_buf],%%r10,1), %%r14 \n"
204 "and $0xffff, %%rax \n"
205 "imul %%rax, %%r14 \n"
206 "xor $0xffff, %%rax \n"
207 "imul %%rax, %%r13 \n"
208 "add %%r14, %%r13 \n"
209 "shr $16, %%r13 \n"
210 "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm2\n"
// Add the shared chroma term to both pixels, combine, shift each 16-bit lane
// right by 6, pack to bytes with unsigned saturation and store 8 bytes.
212 "paddsw %%xmm0,%%xmm1\n"
213 "paddsw %%xmm0,%%xmm2\n"
214 "shufps $0x44,%%xmm2,%%xmm1\n"
215 "psraw $0x6,%%xmm1\n"
216 "packuswb %%xmm1,%%xmm1\n"
217 "movq %%xmm1,0x0(%[rgb_buf])\n"
218 "add $0x8,%[rgb_buf]\n"
219 "sub $0x2,%[width]\n"
220 "jns 1b\n"
// Tail: one remaining pixel, if any. This path does not blend; it uses the
// single sample at each index and stores 4 bytes.
222 "2:"
223 "add $0x1,%[width]\n"
224 "js 3f\n"
226 "mov %%r11,%%r10\n"
227 "sar $0x11,%%r10\n"
229 "movzb (%[u_buf],%%r10,1), %%r13 \n"
230 "movq 2048(%[kCoefficientsRgbY],%%r13,8),%%xmm0\n"
232 "movzb (%[v_buf],%%r10,1), %%r13 \n"
233 "movq 4096(%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
235 "paddsw %%xmm1,%%xmm0\n"
236 "sar $0x10,%%r11\n"
238 "movzb (%[y_buf],%%r11,1), %%r13 \n"
239 "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
241 "paddsw %%xmm0,%%xmm1\n"
242 "psraw $0x6,%%xmm1\n"
243 "packuswb %%xmm1,%%xmm1\n"
244 "movd %%xmm1,0x0(%[rgb_buf])\n"
246 "3:"
// Only rgb_buf and width are modified; the source pointers are indexed, not
// advanced.
247 : [rgb_buf] "+r"(rgb_buf),
248 [width] "+r"(width)
249 : [y_buf] "r"(y_buf),
250 [u_buf] "r"(u_buf),
251 [v_buf] "r"(v_buf),
252 [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
253 [source_dx] "r"(static_cast<long>(source_dx))
// r13/r14 hold the two bytes being blended; rax holds the blend weight.
254 : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
258 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
260 // PIC version is slower because fewer registers are available, so
261 // non-PIC is used on platforms where it is possible.
// 32-bit non-PIC build: FastConvertYUVToRGB32Row_SSE is declared here and
// defined by the file-scope assembly that follows. It converts one row
// without scaling and addresses kCoefficientsRgbY directly by symbol
// (Y entries at byte offset 0, U at 2048, V at 4096, 8 bytes per entry).
// NOTE(review): the routine uses MMX registers and issues no emms;
// presumably callers clear the MMX state afterwards - confirm.
262 void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
263 const uint8_t* u_buf,
264 const uint8_t* v_buf,
265 uint8_t* rgb_buf,
266 int width);
267 asm(
268 ".text\n"
269 ".global FastConvertYUVToRGB32Row_SSE\n"
270 ".type FastConvertYUVToRGB32Row_SSE, @function\n"
271 "FastConvertYUVToRGB32Row_SSE:\n"
// pusha saves the eight general-purpose registers (32 bytes), so the stack
// arguments start at 0x24(%esp). In declared order:
// edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf, ecx = width.
272 "pusha\n"
273 "mov 0x24(%esp),%edx\n"
274 "mov 0x28(%esp),%edi\n"
275 "mov 0x2c(%esp),%esi\n"
276 "mov 0x30(%esp),%ebp\n"
277 "mov 0x34(%esp),%ecx\n"
// Enter the loop at its width test (label 1).
278 "jmp 1f\n"
// Main loop: one U and one V sample are combined with two Y samples.
280 "0:"
281 "movzbl (%edi),%eax\n"
282 "add $0x1,%edi\n"
283 "movzbl (%esi),%ebx\n"
284 "add $0x1,%esi\n"
285 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
286 "movzbl (%edx),%eax\n"
287 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
288 "movzbl 0x1(%edx),%ebx\n"
289 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
290 "add $0x2,%edx\n"
291 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n"
292 "paddsw %mm0,%mm1\n"
293 "paddsw %mm0,%mm2\n"
// Shift each 16-bit lane right by 6, pack both pixels to bytes with unsigned
// saturation and write 8 bytes with a non-temporal store.
294 "psraw $0x6,%mm1\n"
295 "psraw $0x6,%mm2\n"
296 "packuswb %mm2,%mm1\n"
297 "movntq %mm1,0x0(%ebp)\n"
298 "add $0x8,%ebp\n"
// Loop test: keep going while at least two pixels remain.
299 "1:"
300 "sub $0x2,%ecx\n"
301 "jns 0b\n"
// For a non-negative input width, ecx is now -2 or -1; its low bit is set
// only when one pixel is left.
303 "and $0x1,%ecx\n"
304 "je 2f\n"
// Final odd pixel: single Y sample, 4 bytes stored.
306 "movzbl (%edi),%eax\n"
307 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
308 "movzbl (%esi),%eax\n"
309 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
310 "movzbl (%edx),%eax\n"
311 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
312 "paddsw %mm0,%mm1\n"
313 "psraw $0x6,%mm1\n"
314 "packuswb %mm1,%mm1\n"
315 "movd %mm1,0x0(%ebp)\n"
// Restore the registers saved by pusha and return.
316 "2:"
317 "popa\n"
318 "ret\n"
// .previous switches back to the section that was active before .text; it is
// left out when XP_MACOSX is defined.
319 #if !defined(XP_MACOSX)
320 ".previous\n"
321 #endif
// Entry point for the 32-bit non-PIC build: calls the assembly routine
// FastConvertYUVToRGB32Row_SSE when mozilla::supports_sse() returns true, and
// otherwise FastConvertYUVToRGB32Row_C with 1 as its final argument.
324 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
325 const uint8_t* u_buf,
326 const uint8_t* v_buf,
327 uint8_t* rgb_buf,
328 int width)
330 if (mozilla::supports_sse()) {
331 FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
332 return;
// Fallback when the SSE path is not taken.
335 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
// 32-bit non-PIC build: ScaleYUVToRGB32Row_SSE is declared here and defined
// by the file-scope assembly that follows. It converts one row while
// stepping through the source by source_dx per output pixel, picking the
// nearest sample. ebx holds the running source position x; the Y index is
// x >> 16 and the U/V index is x >> 17.
// NOTE(review): the routine uses MMX registers and issues no emms;
// presumably callers clear the MMX state afterwards - confirm.
339 void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
340 const uint8_t* u_buf,
341 const uint8_t* v_buf,
342 uint8_t* rgb_buf,
343 int width,
344 int source_dx);
345 asm(
346 ".text\n"
347 ".global ScaleYUVToRGB32Row_SSE\n"
348 ".type ScaleYUVToRGB32Row_SSE, @function\n"
349 "ScaleYUVToRGB32Row_SSE:\n"
// After pusha (32 bytes) the stack arguments start at 0x24(%esp):
// edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf, ecx = width.
// source_dx stays on the stack at 0x38(%esp).
350 "pusha\n"
351 "mov 0x24(%esp),%edx\n"
352 "mov 0x28(%esp),%edi\n"
353 "mov 0x2c(%esp),%esi\n"
354 "mov 0x30(%esp),%ebp\n"
355 "mov 0x34(%esp),%ecx\n"
// x = 0, then enter the loop at its width test.
356 "xor %ebx,%ebx\n"
357 "jmp 1f\n"
// Main loop: two output pixels per pass, sharing one U and one V lookup.
359 "0:"
360 "mov %ebx,%eax\n"
361 "sar $0x11,%eax\n"
362 "movzbl (%edi,%eax,1),%eax\n"
363 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
364 "mov %ebx,%eax\n"
365 "sar $0x11,%eax\n"
366 "movzbl (%esi,%eax,1),%eax\n"
367 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
// First pixel: Y index from the current x, then x += source_dx.
368 "mov %ebx,%eax\n"
369 "add 0x38(%esp),%ebx\n"
370 "sar $0x10,%eax\n"
371 "movzbl (%edx,%eax,1),%eax\n"
372 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
// Second pixel: same again with the advanced x.
373 "mov %ebx,%eax\n"
374 "add 0x38(%esp),%ebx\n"
375 "sar $0x10,%eax\n"
376 "movzbl (%edx,%eax,1),%eax\n"
377 "movq kCoefficientsRgbY(,%eax,8),%mm2\n"
378 "paddsw %mm0,%mm1\n"
379 "paddsw %mm0,%mm2\n"
380 "psraw $0x6,%mm1\n"
381 "psraw $0x6,%mm2\n"
382 "packuswb %mm2,%mm1\n"
383 "movntq %mm1,0x0(%ebp)\n"
384 "add $0x8,%ebp\n"
// Loop test: keep going while at least two pixels remain.
385 "1:"
386 "sub $0x2,%ecx\n"
387 "jns 0b\n"
// The low bit of ecx is set only when one pixel is left.
389 "and $0x1,%ecx\n"
390 "je 2f\n"
// Final odd pixel: 4 bytes stored.
392 "mov %ebx,%eax\n"
393 "sar $0x11,%eax\n"
394 "movzbl (%edi,%eax,1),%eax\n"
395 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
396 "mov %ebx,%eax\n"
397 "sar $0x11,%eax\n"
398 "movzbl (%esi,%eax,1),%eax\n"
399 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
400 "mov %ebx,%eax\n"
401 "sar $0x10,%eax\n"
402 "movzbl (%edx,%eax,1),%eax\n"
403 "movq kCoefficientsRgbY(,%eax,8),%mm1\n"
404 "paddsw %mm0,%mm1\n"
405 "psraw $0x6,%mm1\n"
406 "packuswb %mm1,%mm1\n"
407 "movd %mm1,0x0(%ebp)\n"
// Restore the registers saved by pusha and return.
409 "2:"
410 "popa\n"
411 "ret\n"
// .previous switches back to the section that was active before .text; it is
// left out when XP_MACOSX is defined.
412 #if !defined(XP_MACOSX)
413 ".previous\n"
414 #endif
// Entry point for the 32-bit non-PIC build: calls the assembly routine
// ScaleYUVToRGB32Row_SSE when mozilla::supports_sse() returns true, and
// otherwise ScaleYUVToRGB32Row_C with the same arguments.
417 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
418 const uint8_t* u_buf,
419 const uint8_t* v_buf,
420 uint8_t* rgb_buf,
421 int width,
422 int source_dx)
424 if (mozilla::supports_sse()) {
425 ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
426 width, source_dx);
427 return;
// Fallback when the SSE path is not taken.
430 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
431 width, source_dx);
// 32-bit non-PIC build: LinearScaleYUVToRGB32Row_SSE is declared here and
// defined by the file-scope assembly that follows. It converts one row while
// stepping through the source by source_dx per output pixel; every Y, U and V
// value is a weighted blend of two adjacent source bytes (index and
// index + 1). ebx holds the running source position x.
// NOTE(review): the routine uses MMX registers and issues no emms;
// presumably callers clear the MMX state afterwards - confirm.
434 void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
435 const uint8_t* u_buf,
436 const uint8_t* v_buf,
437 uint8_t* rgb_buf,
438 int width,
439 int source_dx);
440 asm(
441 ".text\n"
442 ".global LinearScaleYUVToRGB32Row_SSE\n"
443 ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
444 "LinearScaleYUVToRGB32Row_SSE:\n"
// After pusha (32 bytes) the stack arguments start at 0x24(%esp):
// edx = y_buf, edi = u_buf, ebp = rgb_buf. v_buf is not cached here; it is
// reloaded from 0x2c(%esp) on every pass because esi doubles as scratch.
445 "pusha\n"
446 "mov 0x24(%esp),%edx\n"
447 "mov 0x28(%esp),%edi\n"
448 "mov 0x30(%esp),%ebp\n"
// The width slot at 0x34(%esp) is overwritten with width * source_dx and is
// used from here on as the end position that x is compared against.
// NOTE(review): only the product is stored; despite the "+ ebx" in the
// comment below, ebx is not added (it is zeroed after this point).
450 // source_width = width * source_dx + ebx
451 "mov 0x34(%esp), %ecx\n"
452 "imull 0x38(%esp), %ecx\n"
453 "mov %ecx, 0x34(%esp)\n"
455 "mov 0x38(%esp), %ecx\n"
456 "xor %ebx,%ebx\n" // x = 0
457 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
458 "jl 1f\n"
459 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
460 "jmp 1f\n"
// Main loop: up to two output pixels per pass.
462 "0:"
463 "mov %ebx,%eax\n"
464 "sar $0x11,%eax\n"
// U: ecx = (u[i] * (weight ^ 0x1fffe) + u[i + 1] * weight) >> 17 with
// weight = x & 0x1fffe; the result indexes the U part of the table.
466 "movzbl (%edi,%eax,1),%ecx\n"
467 "movzbl 1(%edi,%eax,1),%esi\n"
468 "mov %ebx,%eax\n"
469 "andl $0x1fffe, %eax \n"
470 "imul %eax, %esi \n"
471 "xorl $0x1fffe, %eax \n"
472 "imul %eax, %ecx \n"
473 "addl %esi, %ecx \n"
474 "shrl $17, %ecx \n"
475 "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
// V: reload v_buf into esi, blend the same way and add the V entry to mm0.
477 "mov 0x2c(%esp),%esi\n"
478 "mov %ebx,%eax\n"
479 "sar $0x11,%eax\n"
481 "movzbl (%esi,%eax,1),%ecx\n"
482 "movzbl 1(%esi,%eax,1),%esi\n"
483 "mov %ebx,%eax\n"
484 "andl $0x1fffe, %eax \n"
485 "imul %eax, %esi \n"
486 "xorl $0x1fffe, %eax \n"
487 "imul %eax, %ecx \n"
488 "addl %esi, %ecx \n"
489 "shrl $17, %ecx \n"
490 "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"
// First pixel's Y: blend y[i] and y[i + 1] with weight x & 0xffff, shift
// right by 16; x is advanced by source_dx in between.
492 "mov %ebx,%eax\n"
493 "sar $0x10,%eax\n"
494 "movzbl (%edx,%eax,1),%ecx\n"
495 "movzbl 1(%edx,%eax,1),%esi\n"
496 "mov %ebx,%eax\n"
497 "add 0x38(%esp),%ebx\n"
498 "andl $0xffff, %eax \n"
499 "imul %eax, %esi \n"
500 "xorl $0xffff, %eax \n"
501 "imul %eax, %ecx \n"
502 "addl %esi, %ecx \n"
503 "shrl $16, %ecx \n"
504 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n"
// If x has reached the end position there is no second pixel: finish at
// label 2, which writes the single pixel held in mm1.
506 "cmp 0x34(%esp), %ebx\n"
507 "jge 2f\n"
// Second pixel's Y, blended the same way.
509 "mov %ebx,%eax\n"
510 "sar $0x10,%eax\n"
511 "movzbl (%edx,%eax,1),%ecx\n"
512 "movzbl 1(%edx,%eax,1),%esi\n"
513 "mov %ebx,%eax\n"
514 "add 0x38(%esp),%ebx\n"
515 "andl $0xffff, %eax \n"
516 "imul %eax, %esi \n"
517 "xorl $0xffff, %eax \n"
518 "imul %eax, %ecx \n"
519 "addl %esi, %ecx \n"
520 "shrl $16, %ecx \n"
521 "movq kCoefficientsRgbY(,%ecx,8),%mm2\n"
// Add the shared chroma term, shift each 16-bit lane right by 6, pack both
// pixels to bytes and write 8 bytes with a non-temporal store.
523 "paddsw %mm0,%mm1\n"
524 "paddsw %mm0,%mm2\n"
525 "psraw $0x6,%mm1\n"
526 "psraw $0x6,%mm2\n"
527 "packuswb %mm2,%mm1\n"
528 "movntq %mm1,0x0(%ebp)\n"
529 "add $0x8,%ebp\n"
// Loop test: continue while x is below the end position, else return.
531 "1:"
532 "cmp 0x34(%esp), %ebx\n"
533 "jl 0b\n"
534 "popa\n"
535 "ret\n"
// Single-pixel exit: store 4 bytes and return.
537 "2:"
538 "paddsw %mm0, %mm1\n"
539 "psraw $6, %mm1\n"
540 "packuswb %mm1, %mm1\n"
541 "movd %mm1, (%ebp)\n"
542 "popa\n"
543 "ret\n"
// .previous switches back to the section that was active before .text; it is
// left out when XP_MACOSX is defined.
544 #if !defined(XP_MACOSX)
545 ".previous\n"
546 #endif
// Entry point for the 32-bit non-PIC build: calls the assembly routine
// LinearScaleYUVToRGB32Row_SSE when mozilla::supports_sse() returns true, and
// otherwise LinearScaleYUVToRGB32Row_C with the same arguments.
549 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
550 const uint8_t* u_buf,
551 const uint8_t* v_buf,
552 uint8_t* rgb_buf,
553 int width,
554 int source_dx)
556 if (mozilla::supports_sse()) {
557 LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
558 width, source_dx);
559 return;
// Fallback when the SSE path is not taken.
562 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
563 width, source_dx);
566 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
// 32-bit PIC build: PICConvertYUVToRGB32Row_SSE is declared here and defined
// by the file-scope assembly that follows. It converts one row without
// scaling. Instead of naming the table symbol in the assembly, the table
// address is received as the last argument and kept in ecx, so width is
// counted down in place in its stack slot.
// NOTE(review): the routine uses MMX registers and issues no emms;
// presumably callers clear the MMX state afterwards - confirm.
568 void PICConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
569 const uint8_t* u_buf,
570 const uint8_t* v_buf,
571 uint8_t* rgb_buf,
572 int width,
573 const int16_t *kCoefficientsRgbY);
575 asm(
576 ".text\n"
// The label gets a leading underscore when XP_MACOSX is defined.
577 #if defined(XP_MACOSX)
578 "_PICConvertYUVToRGB32Row_SSE:\n"
579 #else
580 "PICConvertYUVToRGB32Row_SSE:\n"
581 #endif
// After pusha (32 bytes) the stack arguments start at 0x24(%esp):
// edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf,
// ecx = table base (0x38); width stays in memory at 0x34(%esp).
582 "pusha\n"
583 "mov 0x24(%esp),%edx\n"
584 "mov 0x28(%esp),%edi\n"
585 "mov 0x2c(%esp),%esi\n"
586 "mov 0x30(%esp),%ebp\n"
587 "mov 0x38(%esp),%ecx\n"
// Enter the loop at its width test (label 1).
589 "jmp 1f\n"
// Main loop: one U and one V sample are combined with two Y samples; table
// entries are 8 bytes, with U at byte offset 2048 and V at 4096.
591 "0:"
592 "movzbl (%edi),%eax\n"
593 "add $0x1,%edi\n"
594 "movzbl (%esi),%ebx\n"
595 "add $0x1,%esi\n"
596 "movq 2048(%ecx,%eax,8),%mm0\n"
597 "movzbl (%edx),%eax\n"
598 "paddsw 4096(%ecx,%ebx,8),%mm0\n"
599 "movzbl 0x1(%edx),%ebx\n"
600 "movq 0(%ecx,%eax,8),%mm1\n"
601 "add $0x2,%edx\n"
602 "movq 0(%ecx,%ebx,8),%mm2\n"
603 "paddsw %mm0,%mm1\n"
604 "paddsw %mm0,%mm2\n"
// Shift each 16-bit lane right by 6, pack both pixels to bytes with unsigned
// saturation and write 8 bytes with a non-temporal store.
605 "psraw $0x6,%mm1\n"
606 "psraw $0x6,%mm2\n"
607 "packuswb %mm2,%mm1\n"
608 "movntq %mm1,0x0(%ebp)\n"
609 "add $0x8,%ebp\n"
// Loop test on the in-memory width: keep going while at least two pixels
// remain.
610 "1:"
611 "subl $0x2,0x34(%esp)\n"
612 "jns 0b\n"
// The low bit of the remaining count is set only when one pixel is left.
614 "andl $0x1,0x34(%esp)\n"
615 "je 2f\n"
// Final odd pixel: single Y sample, 4 bytes stored.
617 "movzbl (%edi),%eax\n"
618 "movq 2048(%ecx,%eax,8),%mm0\n"
619 "movzbl (%esi),%eax\n"
620 "paddsw 4096(%ecx,%eax,8),%mm0\n"
621 "movzbl (%edx),%eax\n"
622 "movq 0(%ecx,%eax,8),%mm1\n"
623 "paddsw %mm0,%mm1\n"
624 "psraw $0x6,%mm1\n"
625 "packuswb %mm1,%mm1\n"
626 "movd %mm1,0x0(%ebp)\n"
// Restore the registers saved by pusha and return.
627 "2:"
628 "popa\n"
629 "ret\n"
// .previous switches back to the section that was active before .text; it is
// left out when XP_MACOSX is defined.
630 #if !defined(XP_MACOSX)
631 ".previous\n"
632 #endif
// Entry point for the 32-bit PIC build: when mozilla::supports_sse() returns
// true, calls PICConvertYUVToRGB32Row_SSE and hands it the address of the
// first kCoefficientsRgbY element; otherwise calls FastConvertYUVToRGB32Row_C
// with 1 as its final argument.
635 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
636 const uint8_t* u_buf,
637 const uint8_t* v_buf,
638 uint8_t* rgb_buf,
639 int width)
641 if (mozilla::supports_sse()) {
642 PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
643 &kCoefficientsRgbY[0][0]);
644 return;
// Fallback when the SSE path is not taken.
647 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
// 32-bit PIC build: PICScaleYUVToRGB32Row_SSE is declared here and defined by
// the file-scope assembly that follows. It converts one row while stepping
// through the source by source_dx per output pixel, picking the nearest
// sample. The table address arrives as the last argument and is kept in ecx;
// width is counted down in place in its stack slot. ebx holds the running
// source position x (Y index x >> 16, U/V index x >> 17).
// NOTE(review): the routine uses MMX registers and issues no emms;
// presumably callers clear the MMX state afterwards - confirm.
650 void PICScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
651 const uint8_t* u_buf,
652 const uint8_t* v_buf,
653 uint8_t* rgb_buf,
654 int width,
655 int source_dx,
656 const int16_t *kCoefficientsRgbY);
658 asm(
659 ".text\n"
// The label gets a leading underscore when XP_MACOSX is defined.
660 #if defined(XP_MACOSX)
661 "_PICScaleYUVToRGB32Row_SSE:\n"
662 #else
663 "PICScaleYUVToRGB32Row_SSE:\n"
664 #endif
// After pusha (32 bytes) the stack arguments start at 0x24(%esp):
// edx = y_buf, edi = u_buf, esi = v_buf, ebp = rgb_buf,
// ecx = table base (0x3c); width (0x34) and source_dx (0x38) stay in memory.
665 "pusha\n"
666 "mov 0x24(%esp),%edx\n"
667 "mov 0x28(%esp),%edi\n"
668 "mov 0x2c(%esp),%esi\n"
669 "mov 0x30(%esp),%ebp\n"
670 "mov 0x3c(%esp),%ecx\n"
// x = 0, then enter the loop at its width test.
671 "xor %ebx,%ebx\n"
672 "jmp 1f\n"
// Main loop: two output pixels per pass, sharing one U and one V lookup.
674 "0:"
675 "mov %ebx,%eax\n"
676 "sar $0x11,%eax\n"
677 "movzbl (%edi,%eax,1),%eax\n"
678 "movq 2048(%ecx,%eax,8),%mm0\n"
679 "mov %ebx,%eax\n"
680 "sar $0x11,%eax\n"
681 "movzbl (%esi,%eax,1),%eax\n"
682 "paddsw 4096(%ecx,%eax,8),%mm0\n"
// First pixel: Y index from the current x, then x += source_dx.
683 "mov %ebx,%eax\n"
684 "add 0x38(%esp),%ebx\n"
685 "sar $0x10,%eax\n"
686 "movzbl (%edx,%eax,1),%eax\n"
687 "movq 0(%ecx,%eax,8),%mm1\n"
// Second pixel: same again with the advanced x.
688 "mov %ebx,%eax\n"
689 "add 0x38(%esp),%ebx\n"
690 "sar $0x10,%eax\n"
691 "movzbl (%edx,%eax,1),%eax\n"
692 "movq 0(%ecx,%eax,8),%mm2\n"
693 "paddsw %mm0,%mm1\n"
694 "paddsw %mm0,%mm2\n"
695 "psraw $0x6,%mm1\n"
696 "psraw $0x6,%mm2\n"
697 "packuswb %mm2,%mm1\n"
698 "movntq %mm1,0x0(%ebp)\n"
699 "add $0x8,%ebp\n"
// Loop test on the in-memory width: keep going while at least two pixels
// remain.
700 "1:"
701 "subl $0x2,0x34(%esp)\n"
702 "jns 0b\n"
// The low bit of the remaining count is set only when one pixel is left.
704 "andl $0x1,0x34(%esp)\n"
705 "je 2f\n"
// Final odd pixel: 4 bytes stored.
707 "mov %ebx,%eax\n"
708 "sar $0x11,%eax\n"
709 "movzbl (%edi,%eax,1),%eax\n"
710 "movq 2048(%ecx,%eax,8),%mm0\n"
711 "mov %ebx,%eax\n"
712 "sar $0x11,%eax\n"
713 "movzbl (%esi,%eax,1),%eax\n"
714 "paddsw 4096(%ecx,%eax,8),%mm0\n"
715 "mov %ebx,%eax\n"
716 "sar $0x10,%eax\n"
717 "movzbl (%edx,%eax,1),%eax\n"
718 "movq 0(%ecx,%eax,8),%mm1\n"
719 "paddsw %mm0,%mm1\n"
720 "psraw $0x6,%mm1\n"
721 "packuswb %mm1,%mm1\n"
722 "movd %mm1,0x0(%ebp)\n"
// Restore the registers saved by pusha and return.
724 "2:"
725 "popa\n"
726 "ret\n"
// .previous switches back to the section that was active before .text; it is
// left out when XP_MACOSX is defined.
727 #if !defined(XP_MACOSX)
728 ".previous\n"
729 #endif
// Entry point for the 32-bit PIC build: when mozilla::supports_sse() returns
// true, calls PICScaleYUVToRGB32Row_SSE and hands it the address of the first
// kCoefficientsRgbY element; otherwise calls ScaleYUVToRGB32Row_C.
732 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
733 const uint8_t* u_buf,
734 const uint8_t* v_buf,
735 uint8_t* rgb_buf,
736 int width,
737 int source_dx)
739 if (mozilla::supports_sse()) {
740 PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
741 &kCoefficientsRgbY[0][0]);
742 return;
// Fallback when the SSE path is not taken.
745 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
// 32-bit PIC build: PICLinearScaleYUVToRGB32Row_SSE is declared here and
// defined by the file-scope assembly that follows. It converts one row while
// stepping through the source by source_dx per output pixel; every Y, U and V
// value is a weighted blend of two adjacent source bytes (index and
// index + 1). The table address arrives as the last argument and is kept in
// edi; ebx holds the running source position x.
// NOTE(review): the routine uses MMX registers and issues no emms;
// presumably callers clear the MMX state afterwards - confirm.
748 void PICLinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
749 const uint8_t* u_buf,
750 const uint8_t* v_buf,
751 uint8_t* rgb_buf,
752 int width,
753 int source_dx,
754 const int16_t *kCoefficientsRgbY);
756 asm(
757 ".text\n"
// The label gets a leading underscore when XP_MACOSX is defined.
758 #if defined(XP_MACOSX)
759 "_PICLinearScaleYUVToRGB32Row_SSE:\n"
760 #else
761 "PICLinearScaleYUVToRGB32Row_SSE:\n"
762 #endif
// After pusha (32 bytes) the stack arguments start at 0x24(%esp):
// edx = y_buf, ebp = rgb_buf, edi = table base (0x3c). u_buf and v_buf are
// reloaded from 0x28/0x2c(%esp) on every pass because esi doubles as scratch.
// NOTE(review): the load of 0x34(%esp) into ecx and the xor of ebx directly
// below are both repeated a few lines further down before either register is
// read, so these two instructions appear to be redundant.
763 "pusha\n"
764 "mov 0x24(%esp),%edx\n"
765 "mov 0x30(%esp),%ebp\n"
766 "mov 0x34(%esp),%ecx\n"
767 "mov 0x3c(%esp),%edi\n"
768 "xor %ebx,%ebx\n"
// The width slot at 0x34(%esp) is overwritten with width * source_dx and is
// used from here on as the end position that x is compared against.
// NOTE(review): only the product is stored; despite the "+ ebx" in the
// comment below, ebx is not added.
770 // source_width = width * source_dx + ebx
771 "mov 0x34(%esp), %ecx\n"
772 "imull 0x38(%esp), %ecx\n"
773 "mov %ecx, 0x34(%esp)\n"
775 "mov 0x38(%esp), %ecx\n"
776 "xor %ebx,%ebx\n" // x = 0
777 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0
778 "jl 1f\n"
779 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less
780 "jmp 1f\n"
// Main loop: up to two output pixels per pass.
782 "0:"
// U: load u_buf, then ecx = (u[i] * (weight ^ 0x1fffe) + u[i + 1] * weight)
// >> 17 with weight = x & 0x1fffe; the result indexes the U part of the
// table.
783 "mov 0x28(%esp),%esi\n"
784 "mov %ebx,%eax\n"
785 "sar $0x11,%eax\n"
787 "movzbl (%esi,%eax,1),%ecx\n"
788 "movzbl 1(%esi,%eax,1),%esi\n"
789 "mov %ebx,%eax\n"
790 "andl $0x1fffe, %eax \n"
791 "imul %eax, %esi \n"
792 "xorl $0x1fffe, %eax \n"
793 "imul %eax, %ecx \n"
794 "addl %esi, %ecx \n"
795 "shrl $17, %ecx \n"
796 "movq 2048(%edi,%ecx,8),%mm0\n"
// V: load v_buf, blend the same way and add the V entry to mm0.
798 "mov 0x2c(%esp),%esi\n"
799 "mov %ebx,%eax\n"
800 "sar $0x11,%eax\n"
802 "movzbl (%esi,%eax,1),%ecx\n"
803 "movzbl 1(%esi,%eax,1),%esi\n"
804 "mov %ebx,%eax\n"
805 "andl $0x1fffe, %eax \n"
806 "imul %eax, %esi \n"
807 "xorl $0x1fffe, %eax \n"
808 "imul %eax, %ecx \n"
809 "addl %esi, %ecx \n"
810 "shrl $17, %ecx \n"
811 "paddsw 4096(%edi,%ecx,8),%mm0\n"
// First pixel's Y: blend y[i] and y[i + 1] with weight x & 0xffff, shift
// right by 16; x is advanced by source_dx in between.
813 "mov %ebx,%eax\n"
814 "sar $0x10,%eax\n"
815 "movzbl (%edx,%eax,1),%ecx\n"
816 "movzbl 1(%edx,%eax,1),%esi\n"
817 "mov %ebx,%eax\n"
818 "add 0x38(%esp),%ebx\n"
819 "andl $0xffff, %eax \n"
820 "imul %eax, %esi \n"
821 "xorl $0xffff, %eax \n"
822 "imul %eax, %ecx \n"
823 "addl %esi, %ecx \n"
824 "shrl $16, %ecx \n"
825 "movq (%edi,%ecx,8),%mm1\n"
// If x has reached the end position there is no second pixel: finish at
// label 2, which writes the single pixel held in mm1.
827 "cmp 0x34(%esp), %ebx\n"
828 "jge 2f\n"
// Second pixel's Y, blended the same way.
830 "mov %ebx,%eax\n"
831 "sar $0x10,%eax\n"
832 "movzbl (%edx,%eax,1),%ecx\n"
833 "movzbl 1(%edx,%eax,1),%esi\n"
834 "mov %ebx,%eax\n"
835 "add 0x38(%esp),%ebx\n"
836 "andl $0xffff, %eax \n"
837 "imul %eax, %esi \n"
838 "xorl $0xffff, %eax \n"
839 "imul %eax, %ecx \n"
840 "addl %esi, %ecx \n"
841 "shrl $16, %ecx \n"
842 "movq (%edi,%ecx,8),%mm2\n"
// Add the shared chroma term, shift each 16-bit lane right by 6, pack both
// pixels to bytes and write 8 bytes with a non-temporal store.
844 "paddsw %mm0,%mm1\n"
845 "paddsw %mm0,%mm2\n"
846 "psraw $0x6,%mm1\n"
847 "psraw $0x6,%mm2\n"
848 "packuswb %mm2,%mm1\n"
849 "movntq %mm1,0x0(%ebp)\n"
850 "add $0x8,%ebp\n"
// Loop test: continue while the end position is greater than x, else return.
852 "1:"
853 "cmp %ebx, 0x34(%esp)\n"
854 "jg 0b\n"
855 "popa\n"
856 "ret\n"
// Single-pixel exit: store 4 bytes and return.
858 "2:"
859 "paddsw %mm0, %mm1\n"
860 "psraw $6, %mm1\n"
861 "packuswb %mm1, %mm1\n"
862 "movd %mm1, (%ebp)\n"
863 "popa\n"
864 "ret\n"
// .previous switches back to the section that was active before .text; it is
// left out when XP_MACOSX is defined.
865 #if !defined(XP_MACOSX)
866 ".previous\n"
867 #endif
// Entry point for the 32-bit PIC build: when mozilla::supports_sse() returns
// true, calls PICLinearScaleYUVToRGB32Row_SSE and hands it the address of the
// first kCoefficientsRgbY element; otherwise calls
// LinearScaleYUVToRGB32Row_C.
871 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
872 const uint8_t* u_buf,
873 const uint8_t* v_buf,
874 uint8_t* rgb_buf,
875 int width,
876 int source_dx)
878 if (mozilla::supports_sse()) {
879 PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
880 source_dx, &kCoefficientsRgbY[0][0]);
881 return;
// Fallback when the SSE path is not taken.
884 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
886 #else
// Generic build (none of the preceding preprocessor branches apply):
// forwards to FastConvertYUVToRGB32Row_C with 1 as its final argument.
887 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
888 const uint8_t* u_buf,
889 const uint8_t* v_buf,
890 uint8_t* rgb_buf,
891 int width) {
892 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
// Generic build (none of the preceding preprocessor branches apply):
// forwards all arguments unchanged to ScaleYUVToRGB32Row_C.
895 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
896 const uint8_t* u_buf,
897 const uint8_t* v_buf,
898 uint8_t* rgb_buf,
899 int width,
900 int source_dx) {
901 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
// Generic build (none of the preceding preprocessor branches apply):
// forwards all arguments unchanged to LinearScaleYUVToRGB32Row_C.
904 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
905 const uint8_t* u_buf,
906 const uint8_t* v_buf,
907 uint8_t* rgb_buf,
908 int width,
909 int source_dx) {
910 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
912 #endif