/*
 * Support BGR555, BGR565, RGB555 and RGB565 foreign endian output
 * [mplayer/glamo.git] / libswscale / rgb2rgb_template.c
 * blob 8d027c1d7a207c49761ad7c66826c473cac1f512
 */
/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * The C code (not assembly, MMX, ...) of this file can be used
 * under the LGPL license.
 */
30 #include <stddef.h>
32 #undef PREFETCH
33 #undef MOVNTQ
34 #undef EMMS
35 #undef SFENCE
36 #undef MMREG_SIZE
37 #undef PAVGB
39 #if HAVE_SSE2
40 #define MMREG_SIZE 16
41 #else
42 #define MMREG_SIZE 8
43 #endif
45 #if HAVE_AMD3DNOW
46 #define PREFETCH "prefetch"
47 #define PAVGB "pavgusb"
48 #elif HAVE_MMX2
49 #define PREFETCH "prefetchnta"
50 #define PAVGB "pavgb"
51 #else
52 #define PREFETCH " # nop"
53 #endif
55 #if HAVE_AMD3DNOW
56 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
57 #define EMMS "femms"
58 #else
59 #define EMMS "emms"
60 #endif
62 #if HAVE_MMX2
63 #define MOVNTQ "movntq"
64 #define SFENCE "sfence"
65 #else
66 #define MOVNTQ "movq"
67 #define SFENCE " # nop"
68 #endif
70 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
72 uint8_t *dest = dst;
73 const uint8_t *s = src;
74 const uint8_t *end;
75 #if HAVE_MMX
76 const uint8_t *mm_end;
77 #endif
78 end = s + src_size;
79 #if HAVE_MMX
80 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
81 mm_end = end - 23;
82 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
83 while (s < mm_end) {
84 __asm__ volatile(
85 PREFETCH" 32%1 \n\t"
86 "movd %1, %%mm0 \n\t"
87 "punpckldq 3%1, %%mm0 \n\t"
88 "movd 6%1, %%mm1 \n\t"
89 "punpckldq 9%1, %%mm1 \n\t"
90 "movd 12%1, %%mm2 \n\t"
91 "punpckldq 15%1, %%mm2 \n\t"
92 "movd 18%1, %%mm3 \n\t"
93 "punpckldq 21%1, %%mm3 \n\t"
94 "por %%mm7, %%mm0 \n\t"
95 "por %%mm7, %%mm1 \n\t"
96 "por %%mm7, %%mm2 \n\t"
97 "por %%mm7, %%mm3 \n\t"
98 MOVNTQ" %%mm0, %0 \n\t"
99 MOVNTQ" %%mm1, 8%0 \n\t"
100 MOVNTQ" %%mm2, 16%0 \n\t"
101 MOVNTQ" %%mm3, 24%0"
102 :"=m"(*dest)
103 :"m"(*s)
104 :"memory");
105 dest += 32;
106 s += 24;
108 __asm__ volatile(SFENCE:::"memory");
109 __asm__ volatile(EMMS:::"memory");
110 #endif
111 while (s < end) {
112 #if HAVE_BIGENDIAN
113 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
114 *dest++ = 255;
115 *dest++ = s[2];
116 *dest++ = s[1];
117 *dest++ = s[0];
118 s+=3;
119 #else
120 *dest++ = *s++;
121 *dest++ = *s++;
122 *dest++ = *s++;
123 *dest++ = 255;
124 #endif
128 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
130 uint8_t *dest = dst;
131 const uint8_t *s = src;
132 const uint8_t *end;
133 #if HAVE_MMX
134 const uint8_t *mm_end;
135 #endif
136 end = s + src_size;
137 #if HAVE_MMX
138 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
139 mm_end = end - 31;
140 while (s < mm_end) {
141 __asm__ volatile(
142 PREFETCH" 32%1 \n\t"
143 "movq %1, %%mm0 \n\t"
144 "movq 8%1, %%mm1 \n\t"
145 "movq 16%1, %%mm4 \n\t"
146 "movq 24%1, %%mm5 \n\t"
147 "movq %%mm0, %%mm2 \n\t"
148 "movq %%mm1, %%mm3 \n\t"
149 "movq %%mm4, %%mm6 \n\t"
150 "movq %%mm5, %%mm7 \n\t"
151 "psrlq $8, %%mm2 \n\t"
152 "psrlq $8, %%mm3 \n\t"
153 "psrlq $8, %%mm6 \n\t"
154 "psrlq $8, %%mm7 \n\t"
155 "pand %2, %%mm0 \n\t"
156 "pand %2, %%mm1 \n\t"
157 "pand %2, %%mm4 \n\t"
158 "pand %2, %%mm5 \n\t"
159 "pand %3, %%mm2 \n\t"
160 "pand %3, %%mm3 \n\t"
161 "pand %3, %%mm6 \n\t"
162 "pand %3, %%mm7 \n\t"
163 "por %%mm2, %%mm0 \n\t"
164 "por %%mm3, %%mm1 \n\t"
165 "por %%mm6, %%mm4 \n\t"
166 "por %%mm7, %%mm5 \n\t"
168 "movq %%mm1, %%mm2 \n\t"
169 "movq %%mm4, %%mm3 \n\t"
170 "psllq $48, %%mm2 \n\t"
171 "psllq $32, %%mm3 \n\t"
172 "pand %4, %%mm2 \n\t"
173 "pand %5, %%mm3 \n\t"
174 "por %%mm2, %%mm0 \n\t"
175 "psrlq $16, %%mm1 \n\t"
176 "psrlq $32, %%mm4 \n\t"
177 "psllq $16, %%mm5 \n\t"
178 "por %%mm3, %%mm1 \n\t"
179 "pand %6, %%mm5 \n\t"
180 "por %%mm5, %%mm4 \n\t"
182 MOVNTQ" %%mm0, %0 \n\t"
183 MOVNTQ" %%mm1, 8%0 \n\t"
184 MOVNTQ" %%mm4, 16%0"
185 :"=m"(*dest)
186 :"m"(*s),"m"(mask24l),
187 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
188 :"memory");
189 dest += 24;
190 s += 32;
192 __asm__ volatile(SFENCE:::"memory");
193 __asm__ volatile(EMMS:::"memory");
194 #endif
195 while (s < end) {
196 #if HAVE_BIGENDIAN
197 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
198 s++;
199 dest[2] = *s++;
200 dest[1] = *s++;
201 dest[0] = *s++;
202 dest += 3;
203 #else
204 *dest++ = *s++;
205 *dest++ = *s++;
206 *dest++ = *s++;
207 s++;
208 #endif
/*
 original by Strepto/Astral
 ported to gcc & bugfixed: A'rpi
 MMX2, 3DNOW optimization by Nick Kurshev
 32-bit C version, and and&add trick by Michael Niedermayer
*/
218 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
220 register const uint8_t* s=src;
221 register uint8_t* d=dst;
222 register const uint8_t *end;
223 const uint8_t *mm_end;
224 end = s + src_size;
225 #if HAVE_MMX
226 __asm__ volatile(PREFETCH" %0"::"m"(*s));
227 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
228 mm_end = end - 15;
229 while (s<mm_end) {
230 __asm__ volatile(
231 PREFETCH" 32%1 \n\t"
232 "movq %1, %%mm0 \n\t"
233 "movq 8%1, %%mm2 \n\t"
234 "movq %%mm0, %%mm1 \n\t"
235 "movq %%mm2, %%mm3 \n\t"
236 "pand %%mm4, %%mm0 \n\t"
237 "pand %%mm4, %%mm2 \n\t"
238 "paddw %%mm1, %%mm0 \n\t"
239 "paddw %%mm3, %%mm2 \n\t"
240 MOVNTQ" %%mm0, %0 \n\t"
241 MOVNTQ" %%mm2, 8%0"
242 :"=m"(*d)
243 :"m"(*s)
245 d+=16;
246 s+=16;
248 __asm__ volatile(SFENCE:::"memory");
249 __asm__ volatile(EMMS:::"memory");
250 #endif
251 mm_end = end - 3;
252 while (s < mm_end) {
253 register unsigned x= *((const uint32_t *)s);
254 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
255 d+=4;
256 s+=4;
258 if (s < end) {
259 register unsigned short x= *((const uint16_t *)s);
260 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
264 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
266 register const uint8_t* s=src;
267 register uint8_t* d=dst;
268 register const uint8_t *end;
269 const uint8_t *mm_end;
270 end = s + src_size;
271 #if HAVE_MMX
272 __asm__ volatile(PREFETCH" %0"::"m"(*s));
273 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
274 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
275 mm_end = end - 15;
276 while (s<mm_end) {
277 __asm__ volatile(
278 PREFETCH" 32%1 \n\t"
279 "movq %1, %%mm0 \n\t"
280 "movq 8%1, %%mm2 \n\t"
281 "movq %%mm0, %%mm1 \n\t"
282 "movq %%mm2, %%mm3 \n\t"
283 "psrlq $1, %%mm0 \n\t"
284 "psrlq $1, %%mm2 \n\t"
285 "pand %%mm7, %%mm0 \n\t"
286 "pand %%mm7, %%mm2 \n\t"
287 "pand %%mm6, %%mm1 \n\t"
288 "pand %%mm6, %%mm3 \n\t"
289 "por %%mm1, %%mm0 \n\t"
290 "por %%mm3, %%mm2 \n\t"
291 MOVNTQ" %%mm0, %0 \n\t"
292 MOVNTQ" %%mm2, 8%0"
293 :"=m"(*d)
294 :"m"(*s)
296 d+=16;
297 s+=16;
299 __asm__ volatile(SFENCE:::"memory");
300 __asm__ volatile(EMMS:::"memory");
301 #endif
302 mm_end = end - 3;
303 while (s < mm_end) {
304 register uint32_t x= *((const uint32_t*)s);
305 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
306 s+=4;
307 d+=4;
309 if (s < end) {
310 register uint16_t x= *((const uint16_t*)s);
311 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
317 const uint8_t *s = src;
318 const uint8_t *end;
319 #if HAVE_MMX
320 const uint8_t *mm_end;
321 #endif
322 uint16_t *d = (uint16_t *)dst;
323 end = s + src_size;
324 #if HAVE_MMX
325 mm_end = end - 15;
326 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
327 __asm__ volatile(
328 "movq %3, %%mm5 \n\t"
329 "movq %4, %%mm6 \n\t"
330 "movq %5, %%mm7 \n\t"
331 "jmp 2f \n\t"
332 ASMALIGN(4)
333 "1: \n\t"
334 PREFETCH" 32(%1) \n\t"
335 "movd (%1), %%mm0 \n\t"
336 "movd 4(%1), %%mm3 \n\t"
337 "punpckldq 8(%1), %%mm0 \n\t"
338 "punpckldq 12(%1), %%mm3 \n\t"
339 "movq %%mm0, %%mm1 \n\t"
340 "movq %%mm3, %%mm4 \n\t"
341 "pand %%mm6, %%mm0 \n\t"
342 "pand %%mm6, %%mm3 \n\t"
343 "pmaddwd %%mm7, %%mm0 \n\t"
344 "pmaddwd %%mm7, %%mm3 \n\t"
345 "pand %%mm5, %%mm1 \n\t"
346 "pand %%mm5, %%mm4 \n\t"
347 "por %%mm1, %%mm0 \n\t"
348 "por %%mm4, %%mm3 \n\t"
349 "psrld $5, %%mm0 \n\t"
350 "pslld $11, %%mm3 \n\t"
351 "por %%mm3, %%mm0 \n\t"
352 MOVNTQ" %%mm0, (%0) \n\t"
353 "add $16, %1 \n\t"
354 "add $8, %0 \n\t"
355 "2: \n\t"
356 "cmp %2, %1 \n\t"
357 " jb 1b \n\t"
358 : "+r" (d), "+r"(s)
359 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
361 #else
362 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
363 __asm__ volatile(
364 "movq %0, %%mm7 \n\t"
365 "movq %1, %%mm6 \n\t"
366 ::"m"(red_16mask),"m"(green_16mask));
367 while (s < mm_end) {
368 __asm__ volatile(
369 PREFETCH" 32%1 \n\t"
370 "movd %1, %%mm0 \n\t"
371 "movd 4%1, %%mm3 \n\t"
372 "punpckldq 8%1, %%mm0 \n\t"
373 "punpckldq 12%1, %%mm3 \n\t"
374 "movq %%mm0, %%mm1 \n\t"
375 "movq %%mm0, %%mm2 \n\t"
376 "movq %%mm3, %%mm4 \n\t"
377 "movq %%mm3, %%mm5 \n\t"
378 "psrlq $3, %%mm0 \n\t"
379 "psrlq $3, %%mm3 \n\t"
380 "pand %2, %%mm0 \n\t"
381 "pand %2, %%mm3 \n\t"
382 "psrlq $5, %%mm1 \n\t"
383 "psrlq $5, %%mm4 \n\t"
384 "pand %%mm6, %%mm1 \n\t"
385 "pand %%mm6, %%mm4 \n\t"
386 "psrlq $8, %%mm2 \n\t"
387 "psrlq $8, %%mm5 \n\t"
388 "pand %%mm7, %%mm2 \n\t"
389 "pand %%mm7, %%mm5 \n\t"
390 "por %%mm1, %%mm0 \n\t"
391 "por %%mm4, %%mm3 \n\t"
392 "por %%mm2, %%mm0 \n\t"
393 "por %%mm5, %%mm3 \n\t"
394 "psllq $16, %%mm3 \n\t"
395 "por %%mm3, %%mm0 \n\t"
396 MOVNTQ" %%mm0, %0 \n\t"
397 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
398 d += 4;
399 s += 16;
401 #endif
402 __asm__ volatile(SFENCE:::"memory");
403 __asm__ volatile(EMMS:::"memory");
404 #endif
405 while (s < end) {
406 register int rgb = *(const uint32_t*)s; s += 4;
407 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
411 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
413 const uint8_t *s = src;
414 const uint8_t *end;
415 #if HAVE_MMX
416 const uint8_t *mm_end;
417 #endif
418 uint16_t *d = (uint16_t *)dst;
419 end = s + src_size;
420 #if HAVE_MMX
421 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
422 __asm__ volatile(
423 "movq %0, %%mm7 \n\t"
424 "movq %1, %%mm6 \n\t"
425 ::"m"(red_16mask),"m"(green_16mask));
426 mm_end = end - 15;
427 while (s < mm_end) {
428 __asm__ volatile(
429 PREFETCH" 32%1 \n\t"
430 "movd %1, %%mm0 \n\t"
431 "movd 4%1, %%mm3 \n\t"
432 "punpckldq 8%1, %%mm0 \n\t"
433 "punpckldq 12%1, %%mm3 \n\t"
434 "movq %%mm0, %%mm1 \n\t"
435 "movq %%mm0, %%mm2 \n\t"
436 "movq %%mm3, %%mm4 \n\t"
437 "movq %%mm3, %%mm5 \n\t"
438 "psllq $8, %%mm0 \n\t"
439 "psllq $8, %%mm3 \n\t"
440 "pand %%mm7, %%mm0 \n\t"
441 "pand %%mm7, %%mm3 \n\t"
442 "psrlq $5, %%mm1 \n\t"
443 "psrlq $5, %%mm4 \n\t"
444 "pand %%mm6, %%mm1 \n\t"
445 "pand %%mm6, %%mm4 \n\t"
446 "psrlq $19, %%mm2 \n\t"
447 "psrlq $19, %%mm5 \n\t"
448 "pand %2, %%mm2 \n\t"
449 "pand %2, %%mm5 \n\t"
450 "por %%mm1, %%mm0 \n\t"
451 "por %%mm4, %%mm3 \n\t"
452 "por %%mm2, %%mm0 \n\t"
453 "por %%mm5, %%mm3 \n\t"
454 "psllq $16, %%mm3 \n\t"
455 "por %%mm3, %%mm0 \n\t"
456 MOVNTQ" %%mm0, %0 \n\t"
457 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
458 d += 4;
459 s += 16;
461 __asm__ volatile(SFENCE:::"memory");
462 __asm__ volatile(EMMS:::"memory");
463 #endif
464 while (s < end) {
465 register int rgb = *(const uint32_t*)s; s += 4;
466 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
470 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
472 const uint8_t *s = src;
473 const uint8_t *end;
474 #if HAVE_MMX
475 const uint8_t *mm_end;
476 #endif
477 uint16_t *d = (uint16_t *)dst;
478 end = s + src_size;
479 #if HAVE_MMX
480 mm_end = end - 15;
481 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster)
482 __asm__ volatile(
483 "movq %3, %%mm5 \n\t"
484 "movq %4, %%mm6 \n\t"
485 "movq %5, %%mm7 \n\t"
486 "jmp 2f \n\t"
487 ASMALIGN(4)
488 "1: \n\t"
489 PREFETCH" 32(%1) \n\t"
490 "movd (%1), %%mm0 \n\t"
491 "movd 4(%1), %%mm3 \n\t"
492 "punpckldq 8(%1), %%mm0 \n\t"
493 "punpckldq 12(%1), %%mm3 \n\t"
494 "movq %%mm0, %%mm1 \n\t"
495 "movq %%mm3, %%mm4 \n\t"
496 "pand %%mm6, %%mm0 \n\t"
497 "pand %%mm6, %%mm3 \n\t"
498 "pmaddwd %%mm7, %%mm0 \n\t"
499 "pmaddwd %%mm7, %%mm3 \n\t"
500 "pand %%mm5, %%mm1 \n\t"
501 "pand %%mm5, %%mm4 \n\t"
502 "por %%mm1, %%mm0 \n\t"
503 "por %%mm4, %%mm3 \n\t"
504 "psrld $6, %%mm0 \n\t"
505 "pslld $10, %%mm3 \n\t"
506 "por %%mm3, %%mm0 \n\t"
507 MOVNTQ" %%mm0, (%0) \n\t"
508 "add $16, %1 \n\t"
509 "add $8, %0 \n\t"
510 "2: \n\t"
511 "cmp %2, %1 \n\t"
512 " jb 1b \n\t"
513 : "+r" (d), "+r"(s)
514 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
516 #else
517 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
518 __asm__ volatile(
519 "movq %0, %%mm7 \n\t"
520 "movq %1, %%mm6 \n\t"
521 ::"m"(red_15mask),"m"(green_15mask));
522 while (s < mm_end) {
523 __asm__ volatile(
524 PREFETCH" 32%1 \n\t"
525 "movd %1, %%mm0 \n\t"
526 "movd 4%1, %%mm3 \n\t"
527 "punpckldq 8%1, %%mm0 \n\t"
528 "punpckldq 12%1, %%mm3 \n\t"
529 "movq %%mm0, %%mm1 \n\t"
530 "movq %%mm0, %%mm2 \n\t"
531 "movq %%mm3, %%mm4 \n\t"
532 "movq %%mm3, %%mm5 \n\t"
533 "psrlq $3, %%mm0 \n\t"
534 "psrlq $3, %%mm3 \n\t"
535 "pand %2, %%mm0 \n\t"
536 "pand %2, %%mm3 \n\t"
537 "psrlq $6, %%mm1 \n\t"
538 "psrlq $6, %%mm4 \n\t"
539 "pand %%mm6, %%mm1 \n\t"
540 "pand %%mm6, %%mm4 \n\t"
541 "psrlq $9, %%mm2 \n\t"
542 "psrlq $9, %%mm5 \n\t"
543 "pand %%mm7, %%mm2 \n\t"
544 "pand %%mm7, %%mm5 \n\t"
545 "por %%mm1, %%mm0 \n\t"
546 "por %%mm4, %%mm3 \n\t"
547 "por %%mm2, %%mm0 \n\t"
548 "por %%mm5, %%mm3 \n\t"
549 "psllq $16, %%mm3 \n\t"
550 "por %%mm3, %%mm0 \n\t"
551 MOVNTQ" %%mm0, %0 \n\t"
552 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
553 d += 4;
554 s += 16;
556 #endif
557 __asm__ volatile(SFENCE:::"memory");
558 __asm__ volatile(EMMS:::"memory");
559 #endif
560 while (s < end) {
561 register int rgb = *(const uint32_t*)s; s += 4;
562 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
566 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
568 const uint8_t *s = src;
569 const uint8_t *end;
570 #if HAVE_MMX
571 const uint8_t *mm_end;
572 #endif
573 uint16_t *d = (uint16_t *)dst;
574 end = s + src_size;
575 #if HAVE_MMX
576 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
577 __asm__ volatile(
578 "movq %0, %%mm7 \n\t"
579 "movq %1, %%mm6 \n\t"
580 ::"m"(red_15mask),"m"(green_15mask));
581 mm_end = end - 15;
582 while (s < mm_end) {
583 __asm__ volatile(
584 PREFETCH" 32%1 \n\t"
585 "movd %1, %%mm0 \n\t"
586 "movd 4%1, %%mm3 \n\t"
587 "punpckldq 8%1, %%mm0 \n\t"
588 "punpckldq 12%1, %%mm3 \n\t"
589 "movq %%mm0, %%mm1 \n\t"
590 "movq %%mm0, %%mm2 \n\t"
591 "movq %%mm3, %%mm4 \n\t"
592 "movq %%mm3, %%mm5 \n\t"
593 "psllq $7, %%mm0 \n\t"
594 "psllq $7, %%mm3 \n\t"
595 "pand %%mm7, %%mm0 \n\t"
596 "pand %%mm7, %%mm3 \n\t"
597 "psrlq $6, %%mm1 \n\t"
598 "psrlq $6, %%mm4 \n\t"
599 "pand %%mm6, %%mm1 \n\t"
600 "pand %%mm6, %%mm4 \n\t"
601 "psrlq $19, %%mm2 \n\t"
602 "psrlq $19, %%mm5 \n\t"
603 "pand %2, %%mm2 \n\t"
604 "pand %2, %%mm5 \n\t"
605 "por %%mm1, %%mm0 \n\t"
606 "por %%mm4, %%mm3 \n\t"
607 "por %%mm2, %%mm0 \n\t"
608 "por %%mm5, %%mm3 \n\t"
609 "psllq $16, %%mm3 \n\t"
610 "por %%mm3, %%mm0 \n\t"
611 MOVNTQ" %%mm0, %0 \n\t"
612 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
613 d += 4;
614 s += 16;
616 __asm__ volatile(SFENCE:::"memory");
617 __asm__ volatile(EMMS:::"memory");
618 #endif
619 while (s < end) {
620 register int rgb = *(const uint32_t*)s; s += 4;
621 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
625 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
627 const uint8_t *s = src;
628 const uint8_t *end;
629 #if HAVE_MMX
630 const uint8_t *mm_end;
631 #endif
632 uint16_t *d = (uint16_t *)dst;
633 end = s + src_size;
634 #if HAVE_MMX
635 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
636 __asm__ volatile(
637 "movq %0, %%mm7 \n\t"
638 "movq %1, %%mm6 \n\t"
639 ::"m"(red_16mask),"m"(green_16mask));
640 mm_end = end - 11;
641 while (s < mm_end) {
642 __asm__ volatile(
643 PREFETCH" 32%1 \n\t"
644 "movd %1, %%mm0 \n\t"
645 "movd 3%1, %%mm3 \n\t"
646 "punpckldq 6%1, %%mm0 \n\t"
647 "punpckldq 9%1, %%mm3 \n\t"
648 "movq %%mm0, %%mm1 \n\t"
649 "movq %%mm0, %%mm2 \n\t"
650 "movq %%mm3, %%mm4 \n\t"
651 "movq %%mm3, %%mm5 \n\t"
652 "psrlq $3, %%mm0 \n\t"
653 "psrlq $3, %%mm3 \n\t"
654 "pand %2, %%mm0 \n\t"
655 "pand %2, %%mm3 \n\t"
656 "psrlq $5, %%mm1 \n\t"
657 "psrlq $5, %%mm4 \n\t"
658 "pand %%mm6, %%mm1 \n\t"
659 "pand %%mm6, %%mm4 \n\t"
660 "psrlq $8, %%mm2 \n\t"
661 "psrlq $8, %%mm5 \n\t"
662 "pand %%mm7, %%mm2 \n\t"
663 "pand %%mm7, %%mm5 \n\t"
664 "por %%mm1, %%mm0 \n\t"
665 "por %%mm4, %%mm3 \n\t"
666 "por %%mm2, %%mm0 \n\t"
667 "por %%mm5, %%mm3 \n\t"
668 "psllq $16, %%mm3 \n\t"
669 "por %%mm3, %%mm0 \n\t"
670 MOVNTQ" %%mm0, %0 \n\t"
671 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
672 d += 4;
673 s += 12;
675 __asm__ volatile(SFENCE:::"memory");
676 __asm__ volatile(EMMS:::"memory");
677 #endif
678 while (s < end) {
679 const int b = *s++;
680 const int g = *s++;
681 const int r = *s++;
682 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
686 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
688 const uint8_t *s = src;
689 const uint8_t *end;
690 #if HAVE_MMX
691 const uint8_t *mm_end;
692 #endif
693 uint16_t *d = (uint16_t *)dst;
694 end = s + src_size;
695 #if HAVE_MMX
696 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
697 __asm__ volatile(
698 "movq %0, %%mm7 \n\t"
699 "movq %1, %%mm6 \n\t"
700 ::"m"(red_16mask),"m"(green_16mask));
701 mm_end = end - 15;
702 while (s < mm_end) {
703 __asm__ volatile(
704 PREFETCH" 32%1 \n\t"
705 "movd %1, %%mm0 \n\t"
706 "movd 3%1, %%mm3 \n\t"
707 "punpckldq 6%1, %%mm0 \n\t"
708 "punpckldq 9%1, %%mm3 \n\t"
709 "movq %%mm0, %%mm1 \n\t"
710 "movq %%mm0, %%mm2 \n\t"
711 "movq %%mm3, %%mm4 \n\t"
712 "movq %%mm3, %%mm5 \n\t"
713 "psllq $8, %%mm0 \n\t"
714 "psllq $8, %%mm3 \n\t"
715 "pand %%mm7, %%mm0 \n\t"
716 "pand %%mm7, %%mm3 \n\t"
717 "psrlq $5, %%mm1 \n\t"
718 "psrlq $5, %%mm4 \n\t"
719 "pand %%mm6, %%mm1 \n\t"
720 "pand %%mm6, %%mm4 \n\t"
721 "psrlq $19, %%mm2 \n\t"
722 "psrlq $19, %%mm5 \n\t"
723 "pand %2, %%mm2 \n\t"
724 "pand %2, %%mm5 \n\t"
725 "por %%mm1, %%mm0 \n\t"
726 "por %%mm4, %%mm3 \n\t"
727 "por %%mm2, %%mm0 \n\t"
728 "por %%mm5, %%mm3 \n\t"
729 "psllq $16, %%mm3 \n\t"
730 "por %%mm3, %%mm0 \n\t"
731 MOVNTQ" %%mm0, %0 \n\t"
732 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
733 d += 4;
734 s += 12;
736 __asm__ volatile(SFENCE:::"memory");
737 __asm__ volatile(EMMS:::"memory");
738 #endif
739 while (s < end) {
740 const int r = *s++;
741 const int g = *s++;
742 const int b = *s++;
743 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
747 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
749 const uint8_t *s = src;
750 const uint8_t *end;
751 #if HAVE_MMX
752 const uint8_t *mm_end;
753 #endif
754 uint16_t *d = (uint16_t *)dst;
755 end = s + src_size;
756 #if HAVE_MMX
757 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
758 __asm__ volatile(
759 "movq %0, %%mm7 \n\t"
760 "movq %1, %%mm6 \n\t"
761 ::"m"(red_15mask),"m"(green_15mask));
762 mm_end = end - 11;
763 while (s < mm_end) {
764 __asm__ volatile(
765 PREFETCH" 32%1 \n\t"
766 "movd %1, %%mm0 \n\t"
767 "movd 3%1, %%mm3 \n\t"
768 "punpckldq 6%1, %%mm0 \n\t"
769 "punpckldq 9%1, %%mm3 \n\t"
770 "movq %%mm0, %%mm1 \n\t"
771 "movq %%mm0, %%mm2 \n\t"
772 "movq %%mm3, %%mm4 \n\t"
773 "movq %%mm3, %%mm5 \n\t"
774 "psrlq $3, %%mm0 \n\t"
775 "psrlq $3, %%mm3 \n\t"
776 "pand %2, %%mm0 \n\t"
777 "pand %2, %%mm3 \n\t"
778 "psrlq $6, %%mm1 \n\t"
779 "psrlq $6, %%mm4 \n\t"
780 "pand %%mm6, %%mm1 \n\t"
781 "pand %%mm6, %%mm4 \n\t"
782 "psrlq $9, %%mm2 \n\t"
783 "psrlq $9, %%mm5 \n\t"
784 "pand %%mm7, %%mm2 \n\t"
785 "pand %%mm7, %%mm5 \n\t"
786 "por %%mm1, %%mm0 \n\t"
787 "por %%mm4, %%mm3 \n\t"
788 "por %%mm2, %%mm0 \n\t"
789 "por %%mm5, %%mm3 \n\t"
790 "psllq $16, %%mm3 \n\t"
791 "por %%mm3, %%mm0 \n\t"
792 MOVNTQ" %%mm0, %0 \n\t"
793 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
794 d += 4;
795 s += 12;
797 __asm__ volatile(SFENCE:::"memory");
798 __asm__ volatile(EMMS:::"memory");
799 #endif
800 while (s < end) {
801 const int b = *s++;
802 const int g = *s++;
803 const int r = *s++;
804 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
808 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
810 const uint8_t *s = src;
811 const uint8_t *end;
812 #if HAVE_MMX
813 const uint8_t *mm_end;
814 #endif
815 uint16_t *d = (uint16_t *)dst;
816 end = s + src_size;
817 #if HAVE_MMX
818 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
819 __asm__ volatile(
820 "movq %0, %%mm7 \n\t"
821 "movq %1, %%mm6 \n\t"
822 ::"m"(red_15mask),"m"(green_15mask));
823 mm_end = end - 15;
824 while (s < mm_end) {
825 __asm__ volatile(
826 PREFETCH" 32%1 \n\t"
827 "movd %1, %%mm0 \n\t"
828 "movd 3%1, %%mm3 \n\t"
829 "punpckldq 6%1, %%mm0 \n\t"
830 "punpckldq 9%1, %%mm3 \n\t"
831 "movq %%mm0, %%mm1 \n\t"
832 "movq %%mm0, %%mm2 \n\t"
833 "movq %%mm3, %%mm4 \n\t"
834 "movq %%mm3, %%mm5 \n\t"
835 "psllq $7, %%mm0 \n\t"
836 "psllq $7, %%mm3 \n\t"
837 "pand %%mm7, %%mm0 \n\t"
838 "pand %%mm7, %%mm3 \n\t"
839 "psrlq $6, %%mm1 \n\t"
840 "psrlq $6, %%mm4 \n\t"
841 "pand %%mm6, %%mm1 \n\t"
842 "pand %%mm6, %%mm4 \n\t"
843 "psrlq $19, %%mm2 \n\t"
844 "psrlq $19, %%mm5 \n\t"
845 "pand %2, %%mm2 \n\t"
846 "pand %2, %%mm5 \n\t"
847 "por %%mm1, %%mm0 \n\t"
848 "por %%mm4, %%mm3 \n\t"
849 "por %%mm2, %%mm0 \n\t"
850 "por %%mm5, %%mm3 \n\t"
851 "psllq $16, %%mm3 \n\t"
852 "por %%mm3, %%mm0 \n\t"
853 MOVNTQ" %%mm0, %0 \n\t"
854 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
855 d += 4;
856 s += 12;
858 __asm__ volatile(SFENCE:::"memory");
859 __asm__ volatile(EMMS:::"memory");
860 #endif
861 while (s < end) {
862 const int r = *s++;
863 const int g = *s++;
864 const int b = *s++;
865 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
  I use less accurate approximation here by simply left-shifting the input
  value and filling the low order bits with zeroes. This method improves PNG
  compression but this scheme cannot reproduce white exactly, since it does
  not generate an all-ones maximum value; the net effect is to darken the
  image slightly.

  The better method should be "left bit replication":

   4 3 2 1 0
   ---------
   1 1 0 1 1

   7 6 5 4 3 2 1 0
   ----------------
   1 1 0 1 1  1 1 0
   |=======|  |===|
       |      leftmost bits repeated to fill open bits
       |
   original bits
*/
890 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
892 const uint16_t *end;
893 #if HAVE_MMX
894 const uint16_t *mm_end;
895 #endif
896 uint8_t *d = dst;
897 const uint16_t *s = (const uint16_t*)src;
898 end = s + src_size/2;
899 #if HAVE_MMX
900 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
901 mm_end = end - 7;
902 while (s < mm_end) {
903 __asm__ volatile(
904 PREFETCH" 32%1 \n\t"
905 "movq %1, %%mm0 \n\t"
906 "movq %1, %%mm1 \n\t"
907 "movq %1, %%mm2 \n\t"
908 "pand %2, %%mm0 \n\t"
909 "pand %3, %%mm1 \n\t"
910 "pand %4, %%mm2 \n\t"
911 "psllq $3, %%mm0 \n\t"
912 "psrlq $2, %%mm1 \n\t"
913 "psrlq $7, %%mm2 \n\t"
914 "movq %%mm0, %%mm3 \n\t"
915 "movq %%mm1, %%mm4 \n\t"
916 "movq %%mm2, %%mm5 \n\t"
917 "punpcklwd %5, %%mm0 \n\t"
918 "punpcklwd %5, %%mm1 \n\t"
919 "punpcklwd %5, %%mm2 \n\t"
920 "punpckhwd %5, %%mm3 \n\t"
921 "punpckhwd %5, %%mm4 \n\t"
922 "punpckhwd %5, %%mm5 \n\t"
923 "psllq $8, %%mm1 \n\t"
924 "psllq $16, %%mm2 \n\t"
925 "por %%mm1, %%mm0 \n\t"
926 "por %%mm2, %%mm0 \n\t"
927 "psllq $8, %%mm4 \n\t"
928 "psllq $16, %%mm5 \n\t"
929 "por %%mm4, %%mm3 \n\t"
930 "por %%mm5, %%mm3 \n\t"
932 "movq %%mm0, %%mm6 \n\t"
933 "movq %%mm3, %%mm7 \n\t"
935 "movq 8%1, %%mm0 \n\t"
936 "movq 8%1, %%mm1 \n\t"
937 "movq 8%1, %%mm2 \n\t"
938 "pand %2, %%mm0 \n\t"
939 "pand %3, %%mm1 \n\t"
940 "pand %4, %%mm2 \n\t"
941 "psllq $3, %%mm0 \n\t"
942 "psrlq $2, %%mm1 \n\t"
943 "psrlq $7, %%mm2 \n\t"
944 "movq %%mm0, %%mm3 \n\t"
945 "movq %%mm1, %%mm4 \n\t"
946 "movq %%mm2, %%mm5 \n\t"
947 "punpcklwd %5, %%mm0 \n\t"
948 "punpcklwd %5, %%mm1 \n\t"
949 "punpcklwd %5, %%mm2 \n\t"
950 "punpckhwd %5, %%mm3 \n\t"
951 "punpckhwd %5, %%mm4 \n\t"
952 "punpckhwd %5, %%mm5 \n\t"
953 "psllq $8, %%mm1 \n\t"
954 "psllq $16, %%mm2 \n\t"
955 "por %%mm1, %%mm0 \n\t"
956 "por %%mm2, %%mm0 \n\t"
957 "psllq $8, %%mm4 \n\t"
958 "psllq $16, %%mm5 \n\t"
959 "por %%mm4, %%mm3 \n\t"
960 "por %%mm5, %%mm3 \n\t"
962 :"=m"(*d)
963 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
964 :"memory");
965 /* borrowed 32 to 24 */
966 __asm__ volatile(
967 "movq %%mm0, %%mm4 \n\t"
968 "movq %%mm3, %%mm5 \n\t"
969 "movq %%mm6, %%mm0 \n\t"
970 "movq %%mm7, %%mm1 \n\t"
972 "movq %%mm4, %%mm6 \n\t"
973 "movq %%mm5, %%mm7 \n\t"
974 "movq %%mm0, %%mm2 \n\t"
975 "movq %%mm1, %%mm3 \n\t"
977 "psrlq $8, %%mm2 \n\t"
978 "psrlq $8, %%mm3 \n\t"
979 "psrlq $8, %%mm6 \n\t"
980 "psrlq $8, %%mm7 \n\t"
981 "pand %2, %%mm0 \n\t"
982 "pand %2, %%mm1 \n\t"
983 "pand %2, %%mm4 \n\t"
984 "pand %2, %%mm5 \n\t"
985 "pand %3, %%mm2 \n\t"
986 "pand %3, %%mm3 \n\t"
987 "pand %3, %%mm6 \n\t"
988 "pand %3, %%mm7 \n\t"
989 "por %%mm2, %%mm0 \n\t"
990 "por %%mm3, %%mm1 \n\t"
991 "por %%mm6, %%mm4 \n\t"
992 "por %%mm7, %%mm5 \n\t"
994 "movq %%mm1, %%mm2 \n\t"
995 "movq %%mm4, %%mm3 \n\t"
996 "psllq $48, %%mm2 \n\t"
997 "psllq $32, %%mm3 \n\t"
998 "pand %4, %%mm2 \n\t"
999 "pand %5, %%mm3 \n\t"
1000 "por %%mm2, %%mm0 \n\t"
1001 "psrlq $16, %%mm1 \n\t"
1002 "psrlq $32, %%mm4 \n\t"
1003 "psllq $16, %%mm5 \n\t"
1004 "por %%mm3, %%mm1 \n\t"
1005 "pand %6, %%mm5 \n\t"
1006 "por %%mm5, %%mm4 \n\t"
1008 MOVNTQ" %%mm0, %0 \n\t"
1009 MOVNTQ" %%mm1, 8%0 \n\t"
1010 MOVNTQ" %%mm4, 16%0"
1012 :"=m"(*d)
1013 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1014 :"memory");
1015 d += 24;
1016 s += 8;
1018 __asm__ volatile(SFENCE:::"memory");
1019 __asm__ volatile(EMMS:::"memory");
1020 #endif
1021 while (s < end) {
1022 register uint16_t bgr;
1023 bgr = *s++;
1024 *d++ = (bgr&0x1F)<<3;
1025 *d++ = (bgr&0x3E0)>>2;
1026 *d++ = (bgr&0x7C00)>>7;
1030 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1032 const uint16_t *end;
1033 #if HAVE_MMX
1034 const uint16_t *mm_end;
1035 #endif
1036 uint8_t *d = (uint8_t *)dst;
1037 const uint16_t *s = (const uint16_t *)src;
1038 end = s + src_size/2;
1039 #if HAVE_MMX
1040 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1041 mm_end = end - 7;
1042 while (s < mm_end) {
1043 __asm__ volatile(
1044 PREFETCH" 32%1 \n\t"
1045 "movq %1, %%mm0 \n\t"
1046 "movq %1, %%mm1 \n\t"
1047 "movq %1, %%mm2 \n\t"
1048 "pand %2, %%mm0 \n\t"
1049 "pand %3, %%mm1 \n\t"
1050 "pand %4, %%mm2 \n\t"
1051 "psllq $3, %%mm0 \n\t"
1052 "psrlq $3, %%mm1 \n\t"
1053 "psrlq $8, %%mm2 \n\t"
1054 "movq %%mm0, %%mm3 \n\t"
1055 "movq %%mm1, %%mm4 \n\t"
1056 "movq %%mm2, %%mm5 \n\t"
1057 "punpcklwd %5, %%mm0 \n\t"
1058 "punpcklwd %5, %%mm1 \n\t"
1059 "punpcklwd %5, %%mm2 \n\t"
1060 "punpckhwd %5, %%mm3 \n\t"
1061 "punpckhwd %5, %%mm4 \n\t"
1062 "punpckhwd %5, %%mm5 \n\t"
1063 "psllq $8, %%mm1 \n\t"
1064 "psllq $16, %%mm2 \n\t"
1065 "por %%mm1, %%mm0 \n\t"
1066 "por %%mm2, %%mm0 \n\t"
1067 "psllq $8, %%mm4 \n\t"
1068 "psllq $16, %%mm5 \n\t"
1069 "por %%mm4, %%mm3 \n\t"
1070 "por %%mm5, %%mm3 \n\t"
1072 "movq %%mm0, %%mm6 \n\t"
1073 "movq %%mm3, %%mm7 \n\t"
1075 "movq 8%1, %%mm0 \n\t"
1076 "movq 8%1, %%mm1 \n\t"
1077 "movq 8%1, %%mm2 \n\t"
1078 "pand %2, %%mm0 \n\t"
1079 "pand %3, %%mm1 \n\t"
1080 "pand %4, %%mm2 \n\t"
1081 "psllq $3, %%mm0 \n\t"
1082 "psrlq $3, %%mm1 \n\t"
1083 "psrlq $8, %%mm2 \n\t"
1084 "movq %%mm0, %%mm3 \n\t"
1085 "movq %%mm1, %%mm4 \n\t"
1086 "movq %%mm2, %%mm5 \n\t"
1087 "punpcklwd %5, %%mm0 \n\t"
1088 "punpcklwd %5, %%mm1 \n\t"
1089 "punpcklwd %5, %%mm2 \n\t"
1090 "punpckhwd %5, %%mm3 \n\t"
1091 "punpckhwd %5, %%mm4 \n\t"
1092 "punpckhwd %5, %%mm5 \n\t"
1093 "psllq $8, %%mm1 \n\t"
1094 "psllq $16, %%mm2 \n\t"
1095 "por %%mm1, %%mm0 \n\t"
1096 "por %%mm2, %%mm0 \n\t"
1097 "psllq $8, %%mm4 \n\t"
1098 "psllq $16, %%mm5 \n\t"
1099 "por %%mm4, %%mm3 \n\t"
1100 "por %%mm5, %%mm3 \n\t"
1101 :"=m"(*d)
1102 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1103 :"memory");
1104 /* borrowed 32 to 24 */
1105 __asm__ volatile(
1106 "movq %%mm0, %%mm4 \n\t"
1107 "movq %%mm3, %%mm5 \n\t"
1108 "movq %%mm6, %%mm0 \n\t"
1109 "movq %%mm7, %%mm1 \n\t"
1111 "movq %%mm4, %%mm6 \n\t"
1112 "movq %%mm5, %%mm7 \n\t"
1113 "movq %%mm0, %%mm2 \n\t"
1114 "movq %%mm1, %%mm3 \n\t"
1116 "psrlq $8, %%mm2 \n\t"
1117 "psrlq $8, %%mm3 \n\t"
1118 "psrlq $8, %%mm6 \n\t"
1119 "psrlq $8, %%mm7 \n\t"
1120 "pand %2, %%mm0 \n\t"
1121 "pand %2, %%mm1 \n\t"
1122 "pand %2, %%mm4 \n\t"
1123 "pand %2, %%mm5 \n\t"
1124 "pand %3, %%mm2 \n\t"
1125 "pand %3, %%mm3 \n\t"
1126 "pand %3, %%mm6 \n\t"
1127 "pand %3, %%mm7 \n\t"
1128 "por %%mm2, %%mm0 \n\t"
1129 "por %%mm3, %%mm1 \n\t"
1130 "por %%mm6, %%mm4 \n\t"
1131 "por %%mm7, %%mm5 \n\t"
1133 "movq %%mm1, %%mm2 \n\t"
1134 "movq %%mm4, %%mm3 \n\t"
1135 "psllq $48, %%mm2 \n\t"
1136 "psllq $32, %%mm3 \n\t"
1137 "pand %4, %%mm2 \n\t"
1138 "pand %5, %%mm3 \n\t"
1139 "por %%mm2, %%mm0 \n\t"
1140 "psrlq $16, %%mm1 \n\t"
1141 "psrlq $32, %%mm4 \n\t"
1142 "psllq $16, %%mm5 \n\t"
1143 "por %%mm3, %%mm1 \n\t"
1144 "pand %6, %%mm5 \n\t"
1145 "por %%mm5, %%mm4 \n\t"
1147 MOVNTQ" %%mm0, %0 \n\t"
1148 MOVNTQ" %%mm1, 8%0 \n\t"
1149 MOVNTQ" %%mm4, 16%0"
1151 :"=m"(*d)
1152 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1153 :"memory");
1154 d += 24;
1155 s += 8;
1157 __asm__ volatile(SFENCE:::"memory");
1158 __asm__ volatile(EMMS:::"memory");
1159 #endif
1160 while (s < end) {
1161 register uint16_t bgr;
1162 bgr = *s++;
1163 *d++ = (bgr&0x1F)<<3;
1164 *d++ = (bgr&0x7E0)>>3;
1165 *d++ = (bgr&0xF800)>>8;
1170 * mm0 = 00 B3 00 B2 00 B1 00 B0
1171 * mm1 = 00 G3 00 G2 00 G1 00 G0
1172 * mm2 = 00 R3 00 R2 00 R1 00 R0
1173 * mm6 = FF FF FF FF FF FF FF FF
1174 * mm7 = 00 00 00 00 00 00 00 00
1176 #define PACK_RGB32 \
1177 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1178 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1179 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1180 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1181 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1182 "movq %%mm0, %%mm3 \n\t" \
1183 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1184 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1185 MOVNTQ" %%mm0, %0 \n\t" \
1186 MOVNTQ" %%mm3, 8%0 \n\t" \
/*
 * Convert 15 bpp (xRRRRRGG GGGBBBBB, 2 bytes/pixel) to 32 bpp with the
 * alpha byte forced to 255.  src_size is in bytes.  The MMX path handles
 * 4 pixels per iteration (masks mask15b/g/r isolate the channels, then the
 * shifts scale 5-bit values to 8 bits); the scalar loop finishes the tail
 * and is the whole implementation without MMX.
 * NOTE(review): extraction artifact — every line below carries a baked-in
 * original line number and brace-only lines were dropped.
 */
1188 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1190 const uint16_t *end;
1191 #if HAVE_MMX
1192 const uint16_t *mm_end;
1193 #endif
1194 uint8_t *d = dst;
1195 const uint16_t *s = (const uint16_t *)src;
1196 end = s + src_size/2;
1197 #if HAVE_MMX
1198 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0 and mm6 = all-ones, as required by PACK_RGB32 */
1199 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1200 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1201 mm_end = end - 3;
1202 while (s < mm_end) {
1203 __asm__ volatile(
1204 PREFETCH" 32%1 \n\t"
1205 "movq %1, %%mm0 \n\t"
1206 "movq %1, %%mm1 \n\t"
1207 "movq %1, %%mm2 \n\t"
1208 "pand %2, %%mm0 \n\t"
1209 "pand %3, %%mm1 \n\t"
1210 "pand %4, %%mm2 \n\t"
/* scale 5-bit fields up to 8-bit lanes: B<<3, G>>2, R>>7 */
1211 "psllq $3, %%mm0 \n\t"
1212 "psrlq $2, %%mm1 \n\t"
1213 "psrlq $7, %%mm2 \n\t"
1214 PACK_RGB32
1215 :"=m"(*d)
1216 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1217 :"memory");
1218 d += 16;
1219 s += 4;
1221 __asm__ volatile(SFENCE:::"memory");
1222 __asm__ volatile(EMMS:::"memory");
1223 #endif
/* Scalar tail: byte order of the 4 output bytes depends on endianness. */
1224 while (s < end) {
1225 register uint16_t bgr;
1226 bgr = *s++;
1227 #if HAVE_BIGENDIAN
1228 *d++ = 255;
1229 *d++ = (bgr&0x7C00)>>7;
1230 *d++ = (bgr&0x3E0)>>2;
1231 *d++ = (bgr&0x1F)<<3;
1232 #else
1233 *d++ = (bgr&0x1F)<<3;
1234 *d++ = (bgr&0x3E0)>>2;
1235 *d++ = (bgr&0x7C00)>>7;
1236 *d++ = 255;
1237 #endif
/*
 * Convert 16 bpp (RRRRRGGG GGGBBBBB, 2 bytes/pixel) to 32 bpp with the
 * alpha byte forced to 255.  Same structure as rgb15to32, but uses the
 * 5-6-5 masks mask16b/g/r and adjusted shifts (G is 6 bits wide, R sits
 * one bit higher).  src_size is in bytes.
 */
1241 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1243 const uint16_t *end;
1244 #if HAVE_MMX
1245 const uint16_t *mm_end;
1246 #endif
1247 uint8_t *d = dst;
1248 const uint16_t *s = (const uint16_t*)src;
1249 end = s + src_size/2;
1250 #if HAVE_MMX
1251 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0 and mm6 = all-ones, as required by PACK_RGB32 */
1252 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1253 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1254 mm_end = end - 3;
1255 while (s < mm_end) {
1256 __asm__ volatile(
1257 PREFETCH" 32%1 \n\t"
1258 "movq %1, %%mm0 \n\t"
1259 "movq %1, %%mm1 \n\t"
1260 "movq %1, %%mm2 \n\t"
1261 "pand %2, %%mm0 \n\t"
1262 "pand %3, %%mm1 \n\t"
1263 "pand %4, %%mm2 \n\t"
/* scale to 8-bit lanes: B<<3, G>>3 (6-bit field), R>>8 */
1264 "psllq $3, %%mm0 \n\t"
1265 "psrlq $3, %%mm1 \n\t"
1266 "psrlq $8, %%mm2 \n\t"
1267 PACK_RGB32
1268 :"=m"(*d)
1269 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1270 :"memory");
1271 d += 16;
1272 s += 4;
1274 __asm__ volatile(SFENCE:::"memory");
1275 __asm__ volatile(EMMS:::"memory");
1276 #endif
/* Scalar tail: byte order of the 4 output bytes depends on endianness. */
1277 while (s < end) {
1278 register uint16_t bgr;
1279 bgr = *s++;
1280 #if HAVE_BIGENDIAN
1281 *d++ = 255;
1282 *d++ = (bgr&0xF800)>>8;
1283 *d++ = (bgr&0x7E0)>>3;
1284 *d++ = (bgr&0x1F)<<3;
1285 #else
1286 *d++ = (bgr&0x1F)<<3;
1287 *d++ = (bgr&0x7E0)>>3;
1288 *d++ = (bgr&0xF800)>>8;
1289 *d++ = 255;
1290 #endif
/*
 * Swap the R and B bytes of each 32-bit pixel (RGB32 <-> BGR32), leaving
 * the remaining two bytes untouched.  src_size is in bytes.  The loop index
 * starts negative (idx = 15 - src_size) and counts up to 0 so the asm can
 * use a single "js" loop; the C loop then finishes the last <16 bytes.
 */
1294 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1296 x86_reg idx = 15 - src_size;
1297 const uint8_t *s = src-idx;
1298 uint8_t *d = dst-idx;
1299 #if HAVE_MMX
1300 __asm__ volatile(
1301 "test %0, %0 \n\t"
1302 "jns 2f \n\t"
1303 PREFETCH" (%1, %0) \n\t"
/* build the two complementary byte masks: mm7 keeps the unswapped bytes,
 * mm6 selects the R/B bytes to be exchanged (derived from mask32b/mask32r
 * and mmx_one) */
1304 "movq %3, %%mm7 \n\t"
1305 "pxor %4, %%mm7 \n\t"
1306 "movq %%mm7, %%mm6 \n\t"
1307 "pxor %5, %%mm7 \n\t"
1308 ASMALIGN(4)
1309 "1: \n\t"
1310 PREFETCH" 32(%1, %0) \n\t"
1311 "movq (%1, %0), %%mm0 \n\t"
1312 "movq 8(%1, %0), %%mm1 \n\t"
1313 # if HAVE_MMX2
/* pshufw $177 swaps bytes 0<->2 within each 32-bit pixel in one step */
1314 "pshufw $177, %%mm0, %%mm3 \n\t"
1315 "pshufw $177, %%mm1, %%mm5 \n\t"
1316 "pand %%mm7, %%mm0 \n\t"
1317 "pand %%mm6, %%mm3 \n\t"
1318 "pand %%mm7, %%mm1 \n\t"
1319 "pand %%mm6, %%mm5 \n\t"
1320 "por %%mm3, %%mm0 \n\t"
1321 "por %%mm5, %%mm1 \n\t"
1322 # else
/* plain-MMX fallback: isolate R/B, shift them into each other's place */
1323 "movq %%mm0, %%mm2 \n\t"
1324 "movq %%mm1, %%mm4 \n\t"
1325 "pand %%mm7, %%mm0 \n\t"
1326 "pand %%mm6, %%mm2 \n\t"
1327 "pand %%mm7, %%mm1 \n\t"
1328 "pand %%mm6, %%mm4 \n\t"
1329 "movq %%mm2, %%mm3 \n\t"
1330 "movq %%mm4, %%mm5 \n\t"
1331 "pslld $16, %%mm2 \n\t"
1332 "psrld $16, %%mm3 \n\t"
1333 "pslld $16, %%mm4 \n\t"
1334 "psrld $16, %%mm5 \n\t"
1335 "por %%mm2, %%mm0 \n\t"
1336 "por %%mm4, %%mm1 \n\t"
1337 "por %%mm3, %%mm0 \n\t"
1338 "por %%mm5, %%mm1 \n\t"
1339 # endif
1340 MOVNTQ" %%mm0, (%2, %0) \n\t"
1341 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1342 "add $16, %0 \n\t"
1343 "js 1b \n\t"
1344 SFENCE" \n\t"
1345 EMMS" \n\t"
1346 "2: \n\t"
1347 : "+&r"(idx)
1348 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1349 : "memory");
1350 #endif
/* Scalar tail: swap via word masks on little-endian 32-bit loads.
 * NOTE(review): relies on unaligned uint32_t access — x86-only here. */
1351 for (; idx<15; idx+=4) {
1352 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1353 v &= 0xff00ff;
1354 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
/*
 * Swap R and B in packed 24 bpp data (RGB24 <-> BGR24).  src_size is in
 * bytes and should be a multiple of 3.  The MMX path processes 24 input
 * bytes (8 pixels) per iteration using mask24r/g/b to recombine shifted
 * copies; the scalar loop handles whatever remains (or everything
 * without MMX).
 */
1360 unsigned i;
1361 #if HAVE_MMX
1362 x86_reg mmx_size= 23 - src_size;
1363 __asm__ volatile (
1364 "test %%"REG_a", %%"REG_a" \n\t"
1365 "jns 2f \n\t"
1366 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1367 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1368 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1369 ASMALIGN(4)
1370 "1: \n\t"
1371 PREFETCH" 32(%1, %%"REG_a") \n\t"
1372 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1373 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1374 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1375 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1376 "pand %%mm5, %%mm0 \n\t"
1377 "pand %%mm6, %%mm1 \n\t"
1378 "pand %%mm7, %%mm2 \n\t"
1379 "por %%mm0, %%mm1 \n\t"
1380 "por %%mm2, %%mm1 \n\t"
1381 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1382 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1383 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1384 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1385 "pand %%mm7, %%mm0 \n\t"
1386 "pand %%mm5, %%mm1 \n\t"
1387 "pand %%mm6, %%mm2 \n\t"
1388 "por %%mm0, %%mm1 \n\t"
1389 "por %%mm2, %%mm1 \n\t"
1390 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1391 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1392 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1393 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1394 "pand %%mm6, %%mm0 \n\t"
1395 "pand %%mm7, %%mm1 \n\t"
1396 "pand %%mm5, %%mm2 \n\t"
1397 "por %%mm0, %%mm1 \n\t"
1398 "por %%mm2, %%mm1 \n\t"
1399 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1400 "add $24, %%"REG_a" \n\t"
1401 " js 1b \n\t"
1402 "2: \n\t"
1403 : "+a" (mmx_size)
1404 : "r" (src-mmx_size), "r"(dst-mmx_size)
1407 __asm__ volatile(SFENCE:::"memory");
1408 __asm__ volatile(EMMS:::"memory");
1410 if (mmx_size==23) return; //finished, was multiple of 8
/* rewind src/dst/src_size to cover only the unprocessed tail */
1412 src+= src_size;
1413 dst+= src_size;
1414 src_size= 23-mmx_size;
1415 src-= src_size;
1416 dst-= src_size;
1417 #endif
/* scalar per-pixel swap of bytes 0 and 2 */
1418 for (i=0; i<src_size; i+=3) {
1419 register uint8_t x;
1420 x = src[i + 2];
1421 dst[i + 1] = src[i + 1];
1422 dst[i + 2] = src[i + 0];
1423 dst[i + 0] = x;
/*
 * Interleave planar YUV into packed YUY2 (Y0 U0 Y1 V0 ...), one output
 * line per luma line.  vertLumPerChroma = number of luma lines sharing one
 * chroma line (2 for YV12, 1 for YUV422P) — it must be a power of two,
 * because the chroma advance uses the mask (y & (vertLumPerChroma-1)).
 * Strides are in bytes.  Paths: MMX (16 px/iter), Alpha MVI, generic
 * 64-bit, and a portable 32-bit fallback.
 */
1427 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1428 long width, long height,
1429 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1431 long y;
1432 const x86_reg chromWidth= width>>1;
1433 for (y=0; y<height; y++) {
1434 #if HAVE_MMX
1435 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1436 __asm__ volatile(
1437 "xor %%"REG_a", %%"REG_a" \n\t"
1438 ASMALIGN(4)
1439 "1: \n\t"
1440 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1441 PREFETCH" 32(%2, %%"REG_a") \n\t"
1442 PREFETCH" 32(%3, %%"REG_a") \n\t"
1443 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1444 "movq %%mm0, %%mm2 \n\t" // U(0)
1445 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1446 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1447 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1449 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1450 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1451 "movq %%mm3, %%mm4 \n\t" // Y(0)
1452 "movq %%mm5, %%mm6 \n\t" // Y(8)
1453 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1454 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1455 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1456 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1458 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1459 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1460 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1461 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1463 "add $8, %%"REG_a" \n\t"
1464 "cmp %4, %%"REG_a" \n\t"
1465 " jb 1b \n\t"
1466 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1467 : "%"REG_a
1469 #else
1471 #if ARCH_ALPHA && HAVE_MVI
/* Alpha MVI path: processes two luma lines per outer iteration (note the
 * extra y++/ysrc/dst advance below), 8 chroma samples at a time. */
1472 #define pl2yuy2(n) \
1473 y1 = yc[n]; \
1474 y2 = yc2[n]; \
1475 u = uc[n]; \
1476 v = vc[n]; \
1477 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1478 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1479 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1480 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1481 yuv1 = (u << 8) + (v << 24); \
1482 yuv2 = yuv1 + y2; \
1483 yuv1 += y1; \
1484 qdst[n] = yuv1; \
1485 qdst2[n] = yuv2;
1487 int i;
1488 uint64_t *qdst = (uint64_t *) dst;
1489 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1490 const uint32_t *yc = (uint32_t *) ysrc;
1491 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1492 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1493 for (i = 0; i < chromWidth; i += 8) {
1494 uint64_t y1, y2, yuv1, yuv2;
1495 uint64_t u, v;
1496 /* Prefetch */
1497 __asm__("ldq $31,64(%0)" :: "r"(yc));
1498 __asm__("ldq $31,64(%0)" :: "r"(yc2));
1499 __asm__("ldq $31,64(%0)" :: "r"(uc));
1500 __asm__("ldq $31,64(%0)" :: "r"(vc));
1502 pl2yuy2(0);
1503 pl2yuy2(1);
1504 pl2yuy2(2);
1505 pl2yuy2(3);
1507 yc += 4;
1508 yc2 += 4;
1509 uc += 4;
1510 vc += 4;
1511 qdst += 4;
1512 qdst2 += 4;
1514 y++;
1515 ysrc += lumStride;
1516 dst += dstStride;
1518 #elif HAVE_FAST_64BIT
/* generic 64-bit path: build two packed pixels per 64-bit store */
1519 int i;
1520 uint64_t *ldst = (uint64_t *) dst;
1521 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1522 for (i = 0; i < chromWidth; i += 2) {
1523 uint64_t k, l;
1524 k = yc[0] + (uc[0] << 8) +
1525 (yc[1] << 16) + (vc[0] << 24);
1526 l = yc[2] + (uc[1] << 8) +
1527 (yc[3] << 16) + (vc[1] << 24);
1528 *ldst++ = k + (l << 32);
1529 yc += 4;
1530 uc += 2;
1531 vc += 2;
1534 #else
/* portable 32-bit fallback; byte order fixed per endianness */
1535 int i, *idst = (int32_t *) dst;
1536 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1537 for (i = 0; i < chromWidth; i++) {
1538 #if HAVE_BIGENDIAN
1539 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1540 (yc[1] << 8) + (vc[0] << 0);
1541 #else
1542 *idst++ = yc[0] + (uc[0] << 8) +
1543 (yc[1] << 16) + (vc[0] << 24);
1544 #endif
1545 yc += 2;
1546 uc++;
1547 vc++;
1549 #endif
1550 #endif
/* advance chroma only every vertLumPerChroma-th luma line */
1551 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1552 usrc += chromStride;
1553 vsrc += chromStride;
1555 ysrc += lumStride;
1556 dst += dstStride;
1558 #if HAVE_MMX
1559 __asm__(EMMS" \n\t"
1560 SFENCE" \n\t"
1561 :::"memory");
1562 #endif
1566 * Height should be a multiple of 2 and width should be a multiple of 16.
1567 * (If this is a problem for anyone then tell me, and I will fix it.)
/* YV12 (planar, 2x2-subsampled chroma) -> packed YUY2: thin wrapper that
 * calls the generic planar interleaver with 2 luma lines per chroma line.
 * Chroma is replicated, not interpolated (see FIXME below). */
1569 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1570 long width, long height,
1571 long lumStride, long chromStride, long dstStride)
1573 //FIXME interpolate chroma
1574 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Interleave planar YUV into packed UYVY (U0 Y0 V0 Y1 ...).  Mirror of
 * yuvPlanartoyuy2 with the chroma byte leading; vertLumPerChroma must be a
 * power of two (chroma advance uses the mask y & (vertLumPerChroma-1)).
 * Paths: MMX (16 px/iter), generic 64-bit, portable 32-bit fallback; no
 * Alpha path (see FIXME below).
 */
1577 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1578 long width, long height,
1579 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1581 long y;
1582 const x86_reg chromWidth= width>>1;
1583 for (y=0; y<height; y++) {
1584 #if HAVE_MMX
1585 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1586 __asm__ volatile(
1587 "xor %%"REG_a", %%"REG_a" \n\t"
1588 ASMALIGN(4)
1589 "1: \n\t"
1590 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1591 PREFETCH" 32(%2, %%"REG_a") \n\t"
1592 PREFETCH" 32(%3, %%"REG_a") \n\t"
1593 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1594 "movq %%mm0, %%mm2 \n\t" // U(0)
1595 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1596 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1597 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1599 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1600 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
/* note: unpack order is reversed vs. the YUY2 variant so the chroma byte
 * ends up first in each output pair */
1601 "movq %%mm0, %%mm4 \n\t" // Y(0)
1602 "movq %%mm2, %%mm6 \n\t" // Y(8)
1603 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1604 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1605 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1606 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1608 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1609 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1610 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1611 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1613 "add $8, %%"REG_a" \n\t"
1614 "cmp %4, %%"REG_a" \n\t"
1615 " jb 1b \n\t"
1616 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1617 : "%"REG_a
1619 #else
1620 //FIXME adapt the Alpha ASM code from yv12->yuy2
1622 #if HAVE_FAST_64BIT
1623 int i;
1624 uint64_t *ldst = (uint64_t *) dst;
1625 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1626 for (i = 0; i < chromWidth; i += 2) {
1627 uint64_t k, l;
1628 k = uc[0] + (yc[0] << 8) +
1629 (vc[0] << 16) + (yc[1] << 24);
1630 l = uc[1] + (yc[2] << 8) +
1631 (vc[1] << 16) + (yc[3] << 24);
1632 *ldst++ = k + (l << 32);
1633 yc += 4;
1634 uc += 2;
1635 vc += 2;
1638 #else
1639 int i, *idst = (int32_t *) dst;
1640 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1641 for (i = 0; i < chromWidth; i++) {
1642 #if HAVE_BIGENDIAN
1643 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1644 (vc[0] << 8) + (yc[1] << 0);
1645 #else
1646 *idst++ = uc[0] + (yc[0] << 8) +
1647 (vc[0] << 16) + (yc[1] << 24);
1648 #endif
1649 yc += 2;
1650 uc++;
1651 vc++;
1653 #endif
1654 #endif
/* advance chroma only every vertLumPerChroma-th luma line */
1655 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1656 usrc += chromStride;
1657 vsrc += chromStride;
1659 ysrc += lumStride;
1660 dst += dstStride;
1662 #if HAVE_MMX
1663 __asm__(EMMS" \n\t"
1664 SFENCE" \n\t"
1665 :::"memory");
1666 #endif
1670 * Height should be a multiple of 2 and width should be a multiple of 16
1671 * (If this is a problem for anyone then tell me, and I will fix it.)
/* YV12 (planar, 2x2-subsampled chroma) -> packed UYVY: thin wrapper around
 * the generic planar interleaver with 2 luma lines per chroma line.
 * Chroma is replicated, not interpolated (see FIXME below). */
1673 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1674 long width, long height,
1675 long lumStride, long chromStride, long dstStride)
1677 //FIXME interpolate chroma
1678 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1682 * Width should be a multiple of 16.
/* YUV422P (planar, horizontally-subsampled chroma only) -> packed UYVY:
 * every luma line has its own chroma line, hence vertLumPerChroma = 1. */
1684 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1685 long width, long height,
1686 long lumStride, long chromStride, long dstStride)
1688 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1692 * Width should be a multiple of 16.
/* YUV422P (planar, horizontally-subsampled chroma only) -> packed YUY2:
 * every luma line has its own chroma line, hence vertLumPerChroma = 1. */
1694 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1695 long width, long height,
1696 long lumStride, long chromStride, long dstStride)
1698 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1702 * Height should be a multiple of 2 and width should be a multiple of 16.
1703 * (If this is a problem for anyone then tell me, and I will fix it.)
/*
 * De-interleave packed YUY2 (Y0 U0 Y1 V0 ...) into planar YV12.  Processes
 * two input lines per outer iteration: the first line yields Y + the shared
 * U/V planes, the second line yields Y only (its chroma is discarded).
 * Strides are in bytes.
 */
1705 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1706 long width, long height,
1707 long lumStride, long chromStride, long srcStride)
1709 long y;
1710 const x86_reg chromWidth= width>>1;
1711 for (y=0; y<height; y+=2) {
1712 #if HAVE_MMX
1713 __asm__ volatile(
1714 "xor %%"REG_a", %%"REG_a" \n\t"
1715 "pcmpeqw %%mm7, %%mm7 \n\t"
1716 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1717 ASMALIGN(4)
1718 "1: \n\t"
1719 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1720 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1721 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1722 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1723 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1724 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1725 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1726 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1727 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1728 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1729 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1731 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1733 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1734 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1735 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1736 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1737 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1738 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1739 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1740 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1741 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1742 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1744 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1746 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1747 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1748 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1749 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1750 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1751 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1752 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1753 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1755 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1756 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1758 "add $8, %%"REG_a" \n\t"
1759 "cmp %4, %%"REG_a" \n\t"
1760 " jb 1b \n\t"
1761 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1762 : "memory", "%"REG_a
1765 ydst += lumStride;
1766 src += srcStride;
/* second input line: extract luma only.
 * NOTE(review): this asm block reads %%mm7 without setting it — it relies
 * on %%mm7 still holding the 0x00FF word mask from the previous asm
 * statement surviving between the two asm blocks; fragile, verify. */
1768 __asm__ volatile(
1769 "xor %%"REG_a", %%"REG_a" \n\t"
1770 ASMALIGN(4)
1771 "1: \n\t"
1772 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1773 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1774 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1775 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1776 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1777 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1778 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1779 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1780 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1781 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1782 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1784 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1785 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1787 "add $8, %%"REG_a" \n\t"
1788 "cmp %4, %%"REG_a" \n\t"
1789 " jb 1b \n\t"
1791 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1792 : "memory", "%"REG_a
1794 #else
1795 long i;
1796 for (i=0; i<chromWidth; i++) {
1797 ydst[2*i+0] = src[4*i+0];
1798 udst[i] = src[4*i+1];
1799 ydst[2*i+1] = src[4*i+2];
1800 vdst[i] = src[4*i+3];
1802 ydst += lumStride;
1803 src += srcStride;
1805 for (i=0; i<chromWidth; i++) {
1806 ydst[2*i+0] = src[4*i+0];
1807 ydst[2*i+1] = src[4*i+2];
1809 #endif
1810 udst += chromStride;
1811 vdst += chromStride;
1812 ydst += lumStride;
1813 src += srcStride;
1815 #if HAVE_MMX
1816 __asm__ volatile(EMMS" \n\t"
1817 SFENCE" \n\t"
1818 :::"memory");
1819 #endif
/*
 * YVU9 -> YV12 stub: only the luma plane is copied; chroma upscaling is
 * not implemented, so usrc/vsrc/udst/vdst and the stride arguments are
 * currently unused (see XXX below).
 * NOTE(review): the memcpy assumes the Y plane is packed (lumStride ==
 * width) — confirm against callers.
 */
1822 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1823 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1824 long width, long height, long lumStride, long chromStride)
1826 /* Y Plane */
1827 memcpy(ydst, ysrc, width*height);
1829 /* XXX: implement upscaling for U,V */
/*
 * Upscale one plane by 2x in both directions using bilinear 3:1 weighting
 * (each output pixel is (3*near + far) >> 2).  First and last output lines
 * are produced from a single source line; the main loop interpolates
 * between adjacent source lines.  Strides are in bytes.
 */
1834 long x,y;
1836 dst[0]= src[0];
1838 // first line
1839 for (x=0; x<srcWidth-1; x++) {
1840 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1841 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1843 dst[2*srcWidth-1]= src[srcWidth-1];
1845 dst+= dstStride;
1847 for (y=1; y<srcHeight; y++) {
1848 #if HAVE_MMX2 || HAVE_AMD3DNOW
/* PAVGB-based path: repeated pairwise averages approximate the 3:1
 * weights; handles srcWidth rounded down to a multiple of 16, the C code
 * below finishes the row from x = mmxSize-1 on. */
1849 const x86_reg mmxSize= srcWidth&~15;
1850 __asm__ volatile(
1851 "mov %4, %%"REG_a" \n\t"
1852 "1: \n\t"
1853 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1854 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1855 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1856 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1857 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1858 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1859 PAVGB" %%mm0, %%mm5 \n\t"
1860 PAVGB" %%mm0, %%mm3 \n\t"
1861 PAVGB" %%mm0, %%mm5 \n\t"
1862 PAVGB" %%mm0, %%mm3 \n\t"
1863 PAVGB" %%mm1, %%mm4 \n\t"
1864 PAVGB" %%mm1, %%mm2 \n\t"
1865 PAVGB" %%mm1, %%mm4 \n\t"
1866 PAVGB" %%mm1, %%mm2 \n\t"
1867 "movq %%mm5, %%mm7 \n\t"
1868 "movq %%mm4, %%mm6 \n\t"
1869 "punpcklbw %%mm3, %%mm5 \n\t"
1870 "punpckhbw %%mm3, %%mm7 \n\t"
1871 "punpcklbw %%mm2, %%mm4 \n\t"
1872 "punpckhbw %%mm2, %%mm6 \n\t"
1873 #if 1
1874 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1875 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1876 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1877 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1878 #else
1879 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1880 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1881 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1882 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1883 #endif
1884 "add $8, %%"REG_a" \n\t"
1885 " js 1b \n\t"
1886 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1887 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1888 "g" (-mmxSize)
1889 : "%"REG_a
1892 #else
/* no MMX2/3DNow: let the scalar loop below do the whole row */
1893 const x86_reg mmxSize=1;
1894 #endif
/* left edge of the two output rows for this source row pair */
1895 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1896 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1898 for (x=mmxSize-1; x<srcWidth-1; x++) {
1899 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1900 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1901 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1902 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1904 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1905 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1907 dst+=dstStride*2;
1908 src+=srcStride;
1911 // last line
1912 #if 1
1913 dst[0]= src[0];
1915 for (x=0; x<srcWidth-1; x++) {
1916 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1917 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1919 dst[2*srcWidth-1]= src[srcWidth-1];
1920 #else
1921 for (x=0; x<srcWidth; x++) {
1922 dst[2*x+0]=
1923 dst[2*x+1]= src[x];
1925 #endif
1927 #if HAVE_MMX
1928 __asm__ volatile(EMMS" \n\t"
1929 SFENCE" \n\t"
1930 :::"memory");
1931 #endif
1935 * Height should be a multiple of 2 and width should be a multiple of 16.
1936 * (If this is a problem for anyone then tell me, and I will fix it.)
1937 * Chrominance data is only taken from every second line, others are ignored.
1938 * FIXME: Write HQ version.
/*
 * De-interleave packed UYVY (U0 Y0 V0 Y1 ...) into planar YV12.  Same
 * structure as yuy2toyv12 with the mask/shift roles swapped because the
 * chroma byte comes first.  Two input lines per outer iteration: first
 * yields Y + U/V, second yields Y only (its chroma is discarded).
 */
1940 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1941 long width, long height,
1942 long lumStride, long chromStride, long srcStride)
1944 long y;
1945 const x86_reg chromWidth= width>>1;
1946 for (y=0; y<height; y+=2) {
1947 #if HAVE_MMX
1948 __asm__ volatile(
1949 "xor %%"REG_a", %%"REG_a" \n\t"
1950 "pcmpeqw %%mm7, %%mm7 \n\t"
1951 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1952 ASMALIGN(4)
1953 "1: \n\t"
1954 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1955 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1956 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1957 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1958 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1959 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1960 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1961 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1962 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1963 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1964 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1966 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1968 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1969 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1970 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1971 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1972 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1973 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1974 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1975 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1976 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1977 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1979 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1981 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1982 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1983 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1984 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1985 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1986 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1987 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1988 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1990 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1991 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1993 "add $8, %%"REG_a" \n\t"
1994 "cmp %4, %%"REG_a" \n\t"
1995 " jb 1b \n\t"
1996 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1997 : "memory", "%"REG_a
2000 ydst += lumStride;
2001 src += srcStride;
/* second input line: luma only (odd bytes).  NOTE(review): unlike the
 * first asm block, this one sets no mask and extracts Y purely with
 * psrlw/packuswb, so it does not depend on %%mm7 surviving. */
2003 __asm__ volatile(
2004 "xor %%"REG_a", %%"REG_a" \n\t"
2005 ASMALIGN(4)
2006 "1: \n\t"
2007 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2008 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2009 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2010 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2011 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2012 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2013 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2014 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2015 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2016 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2017 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2019 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2020 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2022 "add $8, %%"REG_a" \n\t"
2023 "cmp %4, %%"REG_a" \n\t"
2024 " jb 1b \n\t"
2026 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2027 : "memory", "%"REG_a
2029 #else
2030 long i;
2031 for (i=0; i<chromWidth; i++) {
2032 udst[i] = src[4*i+0];
2033 ydst[2*i+0] = src[4*i+1];
2034 vdst[i] = src[4*i+2];
2035 ydst[2*i+1] = src[4*i+3];
2037 ydst += lumStride;
2038 src += srcStride;
2040 for (i=0; i<chromWidth; i++) {
2041 ydst[2*i+0] = src[4*i+1];
2042 ydst[2*i+1] = src[4*i+3];
2044 #endif
2045 udst += chromStride;
2046 vdst += chromStride;
2047 ydst += lumStride;
2048 src += srcStride;
2050 #if HAVE_MMX
2051 __asm__ volatile(EMMS" \n\t"
2052 SFENCE" \n\t"
2053 :::"memory");
2054 #endif
2058 * Height should be a multiple of 2 and width should be a multiple of 2.
2059 * (If this is a problem for anyone then tell me, and I will fix it.)
2060 * Chrominance data is only taken from every second line,
2061 * others are ignored in the C version.
2062 * FIXME: Write HQ version.
2064 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2065 long width, long height,
2066 long lumStride, long chromStride, long srcStride)
2068 long y;
2069 const x86_reg chromWidth= width>>1;
2070 #if HAVE_MMX
2071 for (y=0; y<height-2; y+=2) {
2072 long i;
2073 for (i=0; i<2; i++) {
2074 __asm__ volatile(
2075 "mov %2, %%"REG_a" \n\t"
2076 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2077 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2078 "pxor %%mm7, %%mm7 \n\t"
2079 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2080 ASMALIGN(4)
2081 "1: \n\t"
2082 PREFETCH" 64(%0, %%"REG_d") \n\t"
2083 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2084 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2085 "punpcklbw %%mm7, %%mm0 \n\t"
2086 "punpcklbw %%mm7, %%mm1 \n\t"
2087 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2088 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2089 "punpcklbw %%mm7, %%mm2 \n\t"
2090 "punpcklbw %%mm7, %%mm3 \n\t"
2091 "pmaddwd %%mm6, %%mm0 \n\t"
2092 "pmaddwd %%mm6, %%mm1 \n\t"
2093 "pmaddwd %%mm6, %%mm2 \n\t"
2094 "pmaddwd %%mm6, %%mm3 \n\t"
2095 #ifndef FAST_BGR2YV12
2096 "psrad $8, %%mm0 \n\t"
2097 "psrad $8, %%mm1 \n\t"
2098 "psrad $8, %%mm2 \n\t"
2099 "psrad $8, %%mm3 \n\t"
2100 #endif
2101 "packssdw %%mm1, %%mm0 \n\t"
2102 "packssdw %%mm3, %%mm2 \n\t"
2103 "pmaddwd %%mm5, %%mm0 \n\t"
2104 "pmaddwd %%mm5, %%mm2 \n\t"
2105 "packssdw %%mm2, %%mm0 \n\t"
2106 "psraw $7, %%mm0 \n\t"
2108 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2109 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2110 "punpcklbw %%mm7, %%mm4 \n\t"
2111 "punpcklbw %%mm7, %%mm1 \n\t"
2112 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2113 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2114 "punpcklbw %%mm7, %%mm2 \n\t"
2115 "punpcklbw %%mm7, %%mm3 \n\t"
2116 "pmaddwd %%mm6, %%mm4 \n\t"
2117 "pmaddwd %%mm6, %%mm1 \n\t"
2118 "pmaddwd %%mm6, %%mm2 \n\t"
2119 "pmaddwd %%mm6, %%mm3 \n\t"
2120 #ifndef FAST_BGR2YV12
2121 "psrad $8, %%mm4 \n\t"
2122 "psrad $8, %%mm1 \n\t"
2123 "psrad $8, %%mm2 \n\t"
2124 "psrad $8, %%mm3 \n\t"
2125 #endif
2126 "packssdw %%mm1, %%mm4 \n\t"
2127 "packssdw %%mm3, %%mm2 \n\t"
2128 "pmaddwd %%mm5, %%mm4 \n\t"
2129 "pmaddwd %%mm5, %%mm2 \n\t"
2130 "add $24, %%"REG_d" \n\t"
2131 "packssdw %%mm2, %%mm4 \n\t"
2132 "psraw $7, %%mm4 \n\t"
2134 "packuswb %%mm4, %%mm0 \n\t"
2135 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2137 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2138 "add $8, %%"REG_a" \n\t"
2139 " js 1b \n\t"
2140 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2141 : "%"REG_a, "%"REG_d
2143 ydst += lumStride;
2144 src += srcStride;
2146 src -= srcStride*2;
2147 __asm__ volatile(
2148 "mov %4, %%"REG_a" \n\t"
2149 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2150 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2151 "pxor %%mm7, %%mm7 \n\t"
2152 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2153 "add %%"REG_d", %%"REG_d" \n\t"
2154 ASMALIGN(4)
2155 "1: \n\t"
2156 PREFETCH" 64(%0, %%"REG_d") \n\t"
2157 PREFETCH" 64(%1, %%"REG_d") \n\t"
2158 #if HAVE_MMX2 || HAVE_AMD3DNOW
2159 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2160 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2161 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2162 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2163 PAVGB" %%mm1, %%mm0 \n\t"
2164 PAVGB" %%mm3, %%mm2 \n\t"
2165 "movq %%mm0, %%mm1 \n\t"
2166 "movq %%mm2, %%mm3 \n\t"
2167 "psrlq $24, %%mm0 \n\t"
2168 "psrlq $24, %%mm2 \n\t"
2169 PAVGB" %%mm1, %%mm0 \n\t"
2170 PAVGB" %%mm3, %%mm2 \n\t"
2171 "punpcklbw %%mm7, %%mm0 \n\t"
2172 "punpcklbw %%mm7, %%mm2 \n\t"
2173 #else
2174 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2175 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2176 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2177 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2178 "punpcklbw %%mm7, %%mm0 \n\t"
2179 "punpcklbw %%mm7, %%mm1 \n\t"
2180 "punpcklbw %%mm7, %%mm2 \n\t"
2181 "punpcklbw %%mm7, %%mm3 \n\t"
2182 "paddw %%mm1, %%mm0 \n\t"
2183 "paddw %%mm3, %%mm2 \n\t"
2184 "paddw %%mm2, %%mm0 \n\t"
2185 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2186 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2187 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2188 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2189 "punpcklbw %%mm7, %%mm4 \n\t"
2190 "punpcklbw %%mm7, %%mm1 \n\t"
2191 "punpcklbw %%mm7, %%mm2 \n\t"
2192 "punpcklbw %%mm7, %%mm3 \n\t"
2193 "paddw %%mm1, %%mm4 \n\t"
2194 "paddw %%mm3, %%mm2 \n\t"
2195 "paddw %%mm4, %%mm2 \n\t"
2196 "psrlw $2, %%mm0 \n\t"
2197 "psrlw $2, %%mm2 \n\t"
2198 #endif
2199 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2200 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2202 "pmaddwd %%mm0, %%mm1 \n\t"
2203 "pmaddwd %%mm2, %%mm3 \n\t"
2204 "pmaddwd %%mm6, %%mm0 \n\t"
2205 "pmaddwd %%mm6, %%mm2 \n\t"
2206 #ifndef FAST_BGR2YV12
2207 "psrad $8, %%mm0 \n\t"
2208 "psrad $8, %%mm1 \n\t"
2209 "psrad $8, %%mm2 \n\t"
2210 "psrad $8, %%mm3 \n\t"
2211 #endif
2212 "packssdw %%mm2, %%mm0 \n\t"
2213 "packssdw %%mm3, %%mm1 \n\t"
2214 "pmaddwd %%mm5, %%mm0 \n\t"
2215 "pmaddwd %%mm5, %%mm1 \n\t"
2216 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2217 "psraw $7, %%mm0 \n\t"
2219 #if HAVE_MMX2 || HAVE_AMD3DNOW
2220 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2221 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2222 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2223 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2224 PAVGB" %%mm1, %%mm4 \n\t"
2225 PAVGB" %%mm3, %%mm2 \n\t"
2226 "movq %%mm4, %%mm1 \n\t"
2227 "movq %%mm2, %%mm3 \n\t"
2228 "psrlq $24, %%mm4 \n\t"
2229 "psrlq $24, %%mm2 \n\t"
2230 PAVGB" %%mm1, %%mm4 \n\t"
2231 PAVGB" %%mm3, %%mm2 \n\t"
2232 "punpcklbw %%mm7, %%mm4 \n\t"
2233 "punpcklbw %%mm7, %%mm2 \n\t"
2234 #else
2235 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2236 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2237 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2238 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2239 "punpcklbw %%mm7, %%mm4 \n\t"
2240 "punpcklbw %%mm7, %%mm1 \n\t"
2241 "punpcklbw %%mm7, %%mm2 \n\t"
2242 "punpcklbw %%mm7, %%mm3 \n\t"
2243 "paddw %%mm1, %%mm4 \n\t"
2244 "paddw %%mm3, %%mm2 \n\t"
2245 "paddw %%mm2, %%mm4 \n\t"
2246 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2247 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2248 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2249 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2250 "punpcklbw %%mm7, %%mm5 \n\t"
2251 "punpcklbw %%mm7, %%mm1 \n\t"
2252 "punpcklbw %%mm7, %%mm2 \n\t"
2253 "punpcklbw %%mm7, %%mm3 \n\t"
2254 "paddw %%mm1, %%mm5 \n\t"
2255 "paddw %%mm3, %%mm2 \n\t"
2256 "paddw %%mm5, %%mm2 \n\t"
2257 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2258 "psrlw $2, %%mm4 \n\t"
2259 "psrlw $2, %%mm2 \n\t"
2260 #endif
2261 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2262 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2264 "pmaddwd %%mm4, %%mm1 \n\t"
2265 "pmaddwd %%mm2, %%mm3 \n\t"
2266 "pmaddwd %%mm6, %%mm4 \n\t"
2267 "pmaddwd %%mm6, %%mm2 \n\t"
2268 #ifndef FAST_BGR2YV12
2269 "psrad $8, %%mm4 \n\t"
2270 "psrad $8, %%mm1 \n\t"
2271 "psrad $8, %%mm2 \n\t"
2272 "psrad $8, %%mm3 \n\t"
2273 #endif
2274 "packssdw %%mm2, %%mm4 \n\t"
2275 "packssdw %%mm3, %%mm1 \n\t"
2276 "pmaddwd %%mm5, %%mm4 \n\t"
2277 "pmaddwd %%mm5, %%mm1 \n\t"
2278 "add $24, %%"REG_d" \n\t"
2279 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2280 "psraw $7, %%mm4 \n\t"
2282 "movq %%mm0, %%mm1 \n\t"
2283 "punpckldq %%mm4, %%mm0 \n\t"
2284 "punpckhdq %%mm4, %%mm1 \n\t"
2285 "packsswb %%mm1, %%mm0 \n\t"
2286 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2287 "movd %%mm0, (%2, %%"REG_a") \n\t"
2288 "punpckhdq %%mm0, %%mm0 \n\t"
2289 "movd %%mm0, (%3, %%"REG_a") \n\t"
2290 "add $4, %%"REG_a" \n\t"
2291 " js 1b \n\t"
2292 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2293 : "%"REG_a, "%"REG_d
2296 udst += chromStride;
2297 vdst += chromStride;
2298 src += srcStride*2;
2301 __asm__ volatile(EMMS" \n\t"
2302 SFENCE" \n\t"
2303 :::"memory");
2304 #else
2305 y=0;
2306 #endif
2307 for (; y<height; y+=2) {
2308 long i;
2309 for (i=0; i<chromWidth; i++) {
2310 unsigned int b = src[6*i+0];
2311 unsigned int g = src[6*i+1];
2312 unsigned int r = src[6*i+2];
2314 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2315 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2316 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2318 udst[i] = U;
2319 vdst[i] = V;
2320 ydst[2*i] = Y;
2322 b = src[6*i+3];
2323 g = src[6*i+4];
2324 r = src[6*i+5];
2326 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2327 ydst[2*i+1] = Y;
2329 ydst += lumStride;
2330 src += srcStride;
2332 for (i=0; i<chromWidth; i++) {
2333 unsigned int b = src[6*i+0];
2334 unsigned int g = src[6*i+1];
2335 unsigned int r = src[6*i+2];
2337 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2339 ydst[2*i] = Y;
2341 b = src[6*i+3];
2342 g = src[6*i+4];
2343 r = src[6*i+5];
2345 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2346 ydst[2*i+1] = Y;
2348 udst += chromStride;
2349 vdst += chromStride;
2350 ydst += lumStride;
2351 src += srcStride;
/*
 * interleaveBytes(): merge two byte planes into one interleaved plane,
 * dest[2*w+0] = src1[w], dest[2*w+1] = src2[w], for 'height' rows of
 * 'width' bytes each; the three strides advance the per-row pointers.
 * SIMD paths consume 16 input bytes of each source per iteration and a
 * scalar loop finishes the remaining width & 15 bytes.
 */
2355 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2356 long width, long height, long src1Stride,
2357 long src2Stride, long dstStride)
2359 long h;
2361 for (h=0; h < height; h++) {
2362 long w;
2364 #if HAVE_MMX
2365 #if HAVE_SSE2
/* SSE2: interleave 16+16 bytes per pass with punpck{l,h}bw and stream the
 * 32-byte result via movntdq (bypasses the cache).
 * NOTE(review): movdqa requires src1/src2 rows to be 16-byte aligned —
 * confirm that all callers guarantee this. */
2366 __asm__(
2367 "xor %%"REG_a", %%"REG_a" \n\t"
2368 "1: \n\t"
2369 PREFETCH" 64(%1, %%"REG_a") \n\t"
2370 PREFETCH" 64(%2, %%"REG_a") \n\t"
2371 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2372 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2373 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2374 "punpcklbw %%xmm2, %%xmm0 \n\t"
2375 "punpckhbw %%xmm2, %%xmm1 \n\t"
2376 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2377 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2378 "add $16, %%"REG_a" \n\t"
2379 "cmp %3, %%"REG_a" \n\t"
2380 " jb 1b \n\t"
2381 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2382 : "memory", "%"REG_a""
2384 #else
/* MMX: same interleave using two 8-byte loads per source and four MOVNTQ
 * non-temporal stores (32 output bytes per iteration). */
2385 __asm__(
2386 "xor %%"REG_a", %%"REG_a" \n\t"
2387 "1: \n\t"
2388 PREFETCH" 64(%1, %%"REG_a") \n\t"
2389 PREFETCH" 64(%2, %%"REG_a") \n\t"
2390 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2391 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2392 "movq %%mm0, %%mm1 \n\t"
2393 "movq %%mm2, %%mm3 \n\t"
2394 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2395 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2396 "punpcklbw %%mm4, %%mm0 \n\t"
2397 "punpckhbw %%mm4, %%mm1 \n\t"
2398 "punpcklbw %%mm5, %%mm2 \n\t"
2399 "punpckhbw %%mm5, %%mm3 \n\t"
2400 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2401 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2402 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2403 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2404 "add $16, %%"REG_a" \n\t"
2405 "cmp %3, %%"REG_a" \n\t"
2406 " jb 1b \n\t"
2407 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2408 : "memory", "%"REG_a
2410 #endif
/* scalar tail: finish the last width & 15 bytes the SIMD loop skipped */
2411 for (w= (width&(~15)); w < width; w++) {
2412 dest[2*w+0] = src1[w];
2413 dest[2*w+1] = src2[w];
2415 #else
/* pure C fallback when MMX is not available */
2416 for (w=0; w < width; w++) {
2417 dest[2*w+0] = src1[w];
2418 dest[2*w+1] = src2[w];
2420 #endif
2421 dest += dstStride;
2422 src1 += src1Stride;
2423 src2 += src2Stride;
2425 #if HAVE_MMX
/* leave the FPU/MMX state clean and order the non-temporal stores */
2426 __asm__(
2427 EMMS" \n\t"
2428 SFENCE" \n\t"
2429 ::: "memory"
2431 #endif
/*
 * vu9_to_vu12(): upscale two quarter-size chroma planes to half-size by
 * pixel doubling.  Horizontally, every source byte is written twice
 * (punpck{l,h}bw of a register with itself in the MMX path); vertically,
 * each source row feeds two destination rows via srcStride*(y>>1).
 * The MMX inner loop expands 32 source bytes into 64 destination bytes.
 */
2434 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2435 uint8_t *dst1, uint8_t *dst2,
2436 long width, long height,
2437 long srcStride1, long srcStride2,
2438 long dstStride1, long dstStride2)
2440 x86_reg y;
2441 long x,w,h;
2442 w=width/2; h=height/2;
2443 #if HAVE_MMX
/* warm the cache with the first row of each source plane */
2444 __asm__ volatile(
2445 PREFETCH" %0 \n\t"
2446 PREFETCH" %1 \n\t"
2447 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2448 #endif
/* first plane: src1 -> dst1 */
2449 for (y=0;y<h;y++) {
2450 const uint8_t* s1=src1+srcStride1*(y>>1);
2451 uint8_t* d=dst1+dstStride1*y;
2452 x=0;
2453 #if HAVE_MMX
2454 for (;x<w-31;x+=32) {
2455 __asm__ volatile(
2456 PREFETCH" 32%1 \n\t"
2457 "movq %1, %%mm0 \n\t"
2458 "movq 8%1, %%mm2 \n\t"
2459 "movq 16%1, %%mm4 \n\t"
2460 "movq 24%1, %%mm6 \n\t"
2461 "movq %%mm0, %%mm1 \n\t"
2462 "movq %%mm2, %%mm3 \n\t"
2463 "movq %%mm4, %%mm5 \n\t"
2464 "movq %%mm6, %%mm7 \n\t"
/* unpacking a register with itself duplicates each byte:
 * punpcklbw doubles the low 4 bytes, punpckhbw the high 4 */
2465 "punpcklbw %%mm0, %%mm0 \n\t"
2466 "punpckhbw %%mm1, %%mm1 \n\t"
2467 "punpcklbw %%mm2, %%mm2 \n\t"
2468 "punpckhbw %%mm3, %%mm3 \n\t"
2469 "punpcklbw %%mm4, %%mm4 \n\t"
2470 "punpckhbw %%mm5, %%mm5 \n\t"
2471 "punpcklbw %%mm6, %%mm6 \n\t"
2472 "punpckhbw %%mm7, %%mm7 \n\t"
2473 MOVNTQ" %%mm0, %0 \n\t"
2474 MOVNTQ" %%mm1, 8%0 \n\t"
2475 MOVNTQ" %%mm2, 16%0 \n\t"
2476 MOVNTQ" %%mm3, 24%0 \n\t"
2477 MOVNTQ" %%mm4, 32%0 \n\t"
2478 MOVNTQ" %%mm5, 40%0 \n\t"
2479 MOVNTQ" %%mm6, 48%0 \n\t"
2480 MOVNTQ" %%mm7, 56%0"
2481 :"=m"(d[2*x])
2482 :"m"(s1[x])
2483 :"memory");
2485 #endif
/* scalar tail / fallback: duplicate each remaining byte */
2486 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* second plane: identical doubling for src2 -> dst2 */
2488 for (y=0;y<h;y++) {
2489 const uint8_t* s2=src2+srcStride2*(y>>1);
2490 uint8_t* d=dst2+dstStride2*y;
2491 x=0;
2492 #if HAVE_MMX
2493 for (;x<w-31;x+=32) {
2494 __asm__ volatile(
2495 PREFETCH" 32%1 \n\t"
2496 "movq %1, %%mm0 \n\t"
2497 "movq 8%1, %%mm2 \n\t"
2498 "movq 16%1, %%mm4 \n\t"
2499 "movq 24%1, %%mm6 \n\t"
2500 "movq %%mm0, %%mm1 \n\t"
2501 "movq %%mm2, %%mm3 \n\t"
2502 "movq %%mm4, %%mm5 \n\t"
2503 "movq %%mm6, %%mm7 \n\t"
2504 "punpcklbw %%mm0, %%mm0 \n\t"
2505 "punpckhbw %%mm1, %%mm1 \n\t"
2506 "punpcklbw %%mm2, %%mm2 \n\t"
2507 "punpckhbw %%mm3, %%mm3 \n\t"
2508 "punpcklbw %%mm4, %%mm4 \n\t"
2509 "punpckhbw %%mm5, %%mm5 \n\t"
2510 "punpcklbw %%mm6, %%mm6 \n\t"
2511 "punpckhbw %%mm7, %%mm7 \n\t"
2512 MOVNTQ" %%mm0, %0 \n\t"
2513 MOVNTQ" %%mm1, 8%0 \n\t"
2514 MOVNTQ" %%mm2, 16%0 \n\t"
2515 MOVNTQ" %%mm3, 24%0 \n\t"
2516 MOVNTQ" %%mm4, 32%0 \n\t"
2517 MOVNTQ" %%mm5, 40%0 \n\t"
2518 MOVNTQ" %%mm6, 48%0 \n\t"
2519 MOVNTQ" %%mm7, 56%0"
2520 :"=m"(d[2*x])
2521 :"m"(s2[x])
2522 :"memory");
2524 #endif
2525 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2527 #if HAVE_MMX
/* restore FPU state and fence the non-temporal stores */
2528 __asm__(
2529 EMMS" \n\t"
2530 SFENCE" \n\t"
2531 ::: "memory"
2533 #endif
/*
 * yvu9_to_yuy2(): pack planar Y plus low-resolution U/V planes into
 * interleaved YUY2 (Y0 U Y1 V ...).  Chroma rows repeat for four luma rows
 * (srcStride*(y>>2)); horizontally each chroma sample is shared across four
 * luma samples (the scalar tail emits 8 output bytes per x from
 * yp[4x..4x+3] together with up[x]/vp[x]).
 * The MMX inner loop packs 32 luma samples (64 output bytes) per pass.
 */
2536 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2537 uint8_t *dst,
2538 long width, long height,
2539 long srcStride1, long srcStride2,
2540 long srcStride3, long dstStride)
2542 x86_reg x;
2543 long y,w,h;
2544 w=width/2; h=height;
2545 for (y=0;y<h;y++) {
2546 const uint8_t* yp=src1+srcStride1*y;
/* one chroma row serves four luma rows */
2547 const uint8_t* up=src2+srcStride2*(y>>2);
2548 const uint8_t* vp=src3+srcStride3*(y>>2);
2549 uint8_t* d=dst+dstStride*y;
2550 x=0;
2551 #if HAVE_MMX
2552 for (;x<w-7;x+=8) {
2553 __asm__ volatile(
2554 PREFETCH" 32(%1, %0) \n\t"
2555 PREFETCH" 32(%2, %0) \n\t"
2556 PREFETCH" 32(%3, %0) \n\t"
2557 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2558 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2559 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2560 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2561 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2562 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
/* duplicate each chroma byte so one sample covers two output Y pairs */
2563 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2564 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2565 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2566 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2568 "movq %%mm1, %%mm6 \n\t"
2569 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2570 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2571 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2572 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2573 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2575 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2576 "movq 8(%1, %0, 4), %%mm0 \n\t"
2577 "movq %%mm0, %%mm3 \n\t"
2578 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2579 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2580 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2581 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2583 "movq %%mm4, %%mm6 \n\t"
2584 "movq 16(%1, %0, 4), %%mm0 \n\t"
2585 "movq %%mm0, %%mm3 \n\t"
2586 "punpcklbw %%mm5, %%mm4 \n\t"
2587 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2588 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2589 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2590 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2592 "punpckhbw %%mm5, %%mm6 \n\t"
2593 "movq 24(%1, %0, 4), %%mm0 \n\t"
2594 "movq %%mm0, %%mm3 \n\t"
2595 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2596 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2597 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2598 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2600 : "+r" (x)
2601 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2602 :"memory");
2604 #endif
/* scalar tail: 4 luma samples plus one shared U/V pair -> 8 output bytes */
2605 for (; x<w; x++) {
2606 const long x2 = x<<2;
2607 d[8*x+0] = yp[x2];
2608 d[8*x+1] = up[x];
2609 d[8*x+2] = yp[x2+1];
2610 d[8*x+3] = vp[x];
2611 d[8*x+4] = yp[x2+2];
2612 d[8*x+5] = up[x];
2613 d[8*x+6] = yp[x2+3];
2614 d[8*x+7] = vp[x];
2617 #if HAVE_MMX
/* restore FPU state and fence the non-temporal stores */
2618 __asm__(
2619 EMMS" \n\t"
2620 SFENCE" \n\t"
2621 ::: "memory"
2623 #endif
/*
 * extract_even(): copy every second byte of src (offsets 0, 2, 4, ...) into
 * dst, 'count' bytes total.  Pointers are advanced to the end and the loop
 * runs with a negative index counting up to zero, so the index doubles as
 * the termination test ("js 1b" / "count < 0").
 * The MMX path masks each 16-bit lane with 0x00FF and packs 32 source bytes
 * into 16 destination bytes per iteration (unaligned loads by design).
 */
2626 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2628 dst += count;
2629 src += 2*count;
2630 count= - count;
2632 #if HAVE_MMX
2633 if(count <= -16) {
/* bias so the final partial group falls through to the scalar loop */
2634 count += 15;
2635 __asm__ volatile(
/* build the 0x00FF,0x00FF,... byte mask in mm7 */
2636 "pcmpeqw %%mm7, %%mm7 \n\t"
2637 "psrlw $8, %%mm7 \n\t"
2638 "1: \n\t"
2639 "movq -30(%1, %0, 2), %%mm0 \n\t"
2640 "movq -22(%1, %0, 2), %%mm1 \n\t"
2641 "movq -14(%1, %0, 2), %%mm2 \n\t"
2642 "movq -6(%1, %0, 2), %%mm3 \n\t"
2643 "pand %%mm7, %%mm0 \n\t"
2644 "pand %%mm7, %%mm1 \n\t"
2645 "pand %%mm7, %%mm2 \n\t"
2646 "pand %%mm7, %%mm3 \n\t"
2647 "packuswb %%mm1, %%mm0 \n\t"
2648 "packuswb %%mm3, %%mm2 \n\t"
2649 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2650 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2651 "add $16, %0 \n\t"
2652 " js 1b \n\t"
2653 : "+r"(count)
2654 : "r"(src), "r"(dst)
2656 count -= 15;
2658 #endif
/* scalar tail / fallback */
2659 while(count<0) {
2660 dst[count]= src[2*count];
2661 count++;
/*
 * extract_even2(): deinterleave a stream of 4-byte groups, writing
 * src[4*i+0] to dst0 and src[4*i+2] to dst1 (e.g. U and V planes from a
 * packed UYVY row).  Same negative-index loop convention as extract_even().
 * The MMX path handles 8 output bytes per plane per iteration.
 */
2665 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2667 dst0+= count;
2668 dst1+= count;
2669 src += 4*count;
2670 count= - count;
2671 #if HAVE_MMX
2672 if(count <= -8) {
2673 count += 7;
2674 __asm__ volatile(
/* mm7 = 0x00FF byte mask */
2675 "pcmpeqw %%mm7, %%mm7 \n\t"
2676 "psrlw $8, %%mm7 \n\t"
2677 "1: \n\t"
2678 "movq -28(%1, %0, 4), %%mm0 \n\t"
2679 "movq -20(%1, %0, 4), %%mm1 \n\t"
2680 "movq -12(%1, %0, 4), %%mm2 \n\t"
2681 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* keep even bytes, pack words down to bytes */
2682 "pand %%mm7, %%mm0 \n\t"
2683 "pand %%mm7, %%mm1 \n\t"
2684 "pand %%mm7, %%mm2 \n\t"
2685 "pand %%mm7, %%mm3 \n\t"
2686 "packuswb %%mm1, %%mm0 \n\t"
2687 "packuswb %%mm3, %%mm2 \n\t"
/* split the packed result into the two destination streams */
2688 "movq %%mm0, %%mm1 \n\t"
2689 "movq %%mm2, %%mm3 \n\t"
2690 "psrlw $8, %%mm0 \n\t"
2691 "psrlw $8, %%mm2 \n\t"
2692 "pand %%mm7, %%mm1 \n\t"
2693 "pand %%mm7, %%mm3 \n\t"
2694 "packuswb %%mm2, %%mm0 \n\t"
2695 "packuswb %%mm3, %%mm1 \n\t"
2696 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2697 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2698 "add $8, %0 \n\t"
2699 " js 1b \n\t"
2700 : "+r"(count)
2701 : "r"(src), "r"(dst0), "r"(dst1)
2703 count -= 7;
2705 #endif
/* scalar tail / fallback */
2706 while(count<0) {
2707 dst0[count]= src[4*count+0];
2708 dst1[count]= src[4*count+2];
2709 count++;
/*
 * extract_even2avg(): like extract_even2() but averages two source rows
 * (src0, src1) first — used to vertically average chroma when converting
 * packed 4:2:2 to 4:2:0.
 * NOTE(review): PAVGB rounds up ((a+b+1)>>1) while the scalar fallback
 * truncates ((a+b)>>1), so the MMX and C paths can differ by one LSB.
 */
2713 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2715 dst0 += count;
2716 dst1 += count;
2717 src0 += 4*count;
2718 src1 += 4*count;
2719 count= - count;
2720 #ifdef PAVGB
2721 if(count <= -8) {
2722 count += 7;
2723 __asm__ volatile(
/* mm7 = 0x00FF byte mask */
2724 "pcmpeqw %%mm7, %%mm7 \n\t"
2725 "psrlw $8, %%mm7 \n\t"
2726 "1: \n\t"
2727 "movq -28(%1, %0, 4), %%mm0 \n\t"
2728 "movq -20(%1, %0, 4), %%mm1 \n\t"
2729 "movq -12(%1, %0, 4), %%mm2 \n\t"
2730 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* byte-wise average with the second source row */
2731 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2732 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2733 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2734 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2735 "pand %%mm7, %%mm0 \n\t"
2736 "pand %%mm7, %%mm1 \n\t"
2737 "pand %%mm7, %%mm2 \n\t"
2738 "pand %%mm7, %%mm3 \n\t"
2739 "packuswb %%mm1, %%mm0 \n\t"
2740 "packuswb %%mm3, %%mm2 \n\t"
2741 "movq %%mm0, %%mm1 \n\t"
2742 "movq %%mm2, %%mm3 \n\t"
2743 "psrlw $8, %%mm0 \n\t"
2744 "psrlw $8, %%mm2 \n\t"
2745 "pand %%mm7, %%mm1 \n\t"
2746 "pand %%mm7, %%mm3 \n\t"
2747 "packuswb %%mm2, %%mm0 \n\t"
2748 "packuswb %%mm3, %%mm1 \n\t"
2749 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2750 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2751 "add $8, %0 \n\t"
2752 " js 1b \n\t"
2753 : "+r"(count)
2754 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2756 count -= 7;
2758 #endif
/* scalar tail / fallback (truncating average) */
2759 while(count<0) {
2760 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2761 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2762 count++;
/*
 * extract_odd2(): counterpart of extract_even2() for the odd offsets —
 * dst0 receives src[4*i+1] and dst1 receives src[4*i+3] (e.g. U and V from
 * a packed YUYV row).  The MMX path shifts each word right by 8 to select
 * the odd bytes; the scalar path achieves the same by the src++ below.
 */
2766 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2768 dst0+= count;
2769 dst1+= count;
2770 src += 4*count;
2771 count= - count;
2772 #if HAVE_MMX
2773 if(count <= -8) {
2774 count += 7;
2775 __asm__ volatile(
/* mm7 = 0x00FF byte mask */
2776 "pcmpeqw %%mm7, %%mm7 \n\t"
2777 "psrlw $8, %%mm7 \n\t"
2778 "1: \n\t"
2779 "movq -28(%1, %0, 4), %%mm0 \n\t"
2780 "movq -20(%1, %0, 4), %%mm1 \n\t"
2781 "movq -12(%1, %0, 4), %%mm2 \n\t"
2782 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* move the odd bytes into the low byte of every word */
2783 "psrlw $8, %%mm0 \n\t"
2784 "psrlw $8, %%mm1 \n\t"
2785 "psrlw $8, %%mm2 \n\t"
2786 "psrlw $8, %%mm3 \n\t"
2787 "packuswb %%mm1, %%mm0 \n\t"
2788 "packuswb %%mm3, %%mm2 \n\t"
2789 "movq %%mm0, %%mm1 \n\t"
2790 "movq %%mm2, %%mm3 \n\t"
2791 "psrlw $8, %%mm0 \n\t"
2792 "psrlw $8, %%mm2 \n\t"
2793 "pand %%mm7, %%mm1 \n\t"
2794 "pand %%mm7, %%mm3 \n\t"
2795 "packuswb %%mm2, %%mm0 \n\t"
2796 "packuswb %%mm3, %%mm1 \n\t"
2797 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2798 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2799 "add $8, %0 \n\t"
2800 " js 1b \n\t"
2801 : "+r"(count)
2802 : "r"(src), "r"(dst0), "r"(dst1)
2804 count -= 7;
2806 #endif
/* shift src so the even-offset scalar loop below reads the odd bytes */
2807 src++;
2808 while(count<0) {
2809 dst0[count]= src[4*count+0];
2810 dst1[count]= src[4*count+2];
2811 count++;
/*
 * extract_odd2avg(): odd-offset variant of extract_even2avg() — averages
 * src0 and src1 rows, then extracts bytes at offsets 4*i+1 (to dst0) and
 * 4*i+3 (to dst1), e.g. vertically averaged U/V from two YUYV rows.
 * NOTE(review): PAVGB rounds up ((a+b+1)>>1) while the scalar fallback
 * truncates ((a+b)>>1), so the MMX and C paths can differ by one LSB.
 */
2815 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2817 dst0 += count;
2818 dst1 += count;
2819 src0 += 4*count;
2820 src1 += 4*count;
2821 count= - count;
2822 #ifdef PAVGB
2823 if(count <= -8) {
2824 count += 7;
2825 __asm__ volatile(
/* mm7 = 0x00FF byte mask */
2826 "pcmpeqw %%mm7, %%mm7 \n\t"
2827 "psrlw $8, %%mm7 \n\t"
2828 "1: \n\t"
2829 "movq -28(%1, %0, 4), %%mm0 \n\t"
2830 "movq -20(%1, %0, 4), %%mm1 \n\t"
2831 "movq -12(%1, %0, 4), %%mm2 \n\t"
2832 "movq -4(%1, %0, 4), %%mm3 \n\t"
/* byte-wise average with the second source row */
2833 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2834 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2835 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2836 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
/* select the odd bytes of each word */
2837 "psrlw $8, %%mm0 \n\t"
2838 "psrlw $8, %%mm1 \n\t"
2839 "psrlw $8, %%mm2 \n\t"
2840 "psrlw $8, %%mm3 \n\t"
2841 "packuswb %%mm1, %%mm0 \n\t"
2842 "packuswb %%mm3, %%mm2 \n\t"
2843 "movq %%mm0, %%mm1 \n\t"
2844 "movq %%mm2, %%mm3 \n\t"
2845 "psrlw $8, %%mm0 \n\t"
2846 "psrlw $8, %%mm2 \n\t"
2847 "pand %%mm7, %%mm1 \n\t"
2848 "pand %%mm7, %%mm3 \n\t"
2849 "packuswb %%mm2, %%mm0 \n\t"
2850 "packuswb %%mm3, %%mm1 \n\t"
2851 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2852 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2853 "add $8, %0 \n\t"
2854 " js 1b \n\t"
2855 : "+r"(count)
2856 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2858 count -= 7;
2860 #endif
/* shift both sources so the even-offset scalar loop reads the odd bytes */
2861 src0++;
2862 src1++;
2863 while(count<0) {
2864 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2865 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2866 count++;
2870 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2871 long width, long height,
2872 long lumStride, long chromStride, long srcStride)
2874 long y;
2875 const long chromWidth= -((-width)>>1);
2877 for (y=0; y<height; y++) {
2878 RENAME(extract_even)(src, ydst, width);
2879 if(y&1) {
2880 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2881 udst+= chromStride;
2882 vdst+= chromStride;
2885 src += srcStride;
2886 ydst+= lumStride;
2888 #if HAVE_MMX
2889 __asm__(
2890 EMMS" \n\t"
2891 SFENCE" \n\t"
2892 ::: "memory"
2894 #endif
2897 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2898 long width, long height,
2899 long lumStride, long chromStride, long srcStride)
2901 long y;
2902 const long chromWidth= -((-width)>>1);
2904 for (y=0; y<height; y++) {
2905 RENAME(extract_even)(src, ydst, width);
2906 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2908 src += srcStride;
2909 ydst+= lumStride;
2910 udst+= chromStride;
2911 vdst+= chromStride;
2913 #if HAVE_MMX
2914 __asm__(
2915 EMMS" \n\t"
2916 SFENCE" \n\t"
2917 ::: "memory"
2919 #endif
2922 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2923 long width, long height,
2924 long lumStride, long chromStride, long srcStride)
2926 long y;
2927 const long chromWidth= -((-width)>>1);
2929 for (y=0; y<height; y++) {
2930 RENAME(extract_even)(src+1, ydst, width);
2931 if(y&1) {
2932 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2933 udst+= chromStride;
2934 vdst+= chromStride;
2937 src += srcStride;
2938 ydst+= lumStride;
2940 #if HAVE_MMX
2941 __asm__(
2942 EMMS" \n\t"
2943 SFENCE" \n\t"
2944 ::: "memory"
2946 #endif
2949 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2950 long width, long height,
2951 long lumStride, long chromStride, long srcStride)
2953 long y;
2954 const long chromWidth= -((-width)>>1);
2956 for (y=0; y<height; y++) {
2957 RENAME(extract_even)(src+1, ydst, width);
2958 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2960 src += srcStride;
2961 ydst+= lumStride;
2962 udst+= chromStride;
2963 vdst+= chromStride;
2965 #if HAVE_MMX
2966 __asm__(
2967 EMMS" \n\t"
2968 SFENCE" \n\t"
2969 ::: "memory"
2971 #endif
2974 static inline void RENAME(rgb2rgb_init)(void)
2976 rgb15to16 = RENAME(rgb15to16);
2977 rgb15tobgr24 = RENAME(rgb15tobgr24);
2978 rgb15to32 = RENAME(rgb15to32);
2979 rgb16tobgr24 = RENAME(rgb16tobgr24);
2980 rgb16to32 = RENAME(rgb16to32);
2981 rgb16to15 = RENAME(rgb16to15);
2982 rgb24tobgr16 = RENAME(rgb24tobgr16);
2983 rgb24tobgr15 = RENAME(rgb24tobgr15);
2984 rgb24tobgr32 = RENAME(rgb24tobgr32);
2985 rgb32to16 = RENAME(rgb32to16);
2986 rgb32to15 = RENAME(rgb32to15);
2987 rgb32tobgr24 = RENAME(rgb32tobgr24);
2988 rgb24to15 = RENAME(rgb24to15);
2989 rgb24to16 = RENAME(rgb24to16);
2990 rgb24tobgr24 = RENAME(rgb24tobgr24);
2991 rgb32tobgr32 = RENAME(rgb32tobgr32);
2992 rgb32tobgr16 = RENAME(rgb32tobgr16);
2993 rgb32tobgr15 = RENAME(rgb32tobgr15);
2994 yv12toyuy2 = RENAME(yv12toyuy2);
2995 yv12touyvy = RENAME(yv12touyvy);
2996 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
2997 yuv422ptouyvy = RENAME(yuv422ptouyvy);
2998 yuy2toyv12 = RENAME(yuy2toyv12);
2999 // yvu9toyv12 = RENAME(yvu9toyv12);
3000 planar2x = RENAME(planar2x);
3001 rgb24toyv12 = RENAME(rgb24toyv12);
3002 interleaveBytes = RENAME(interleaveBytes);
3003 vu9_to_vu12 = RENAME(vu9_to_vu12);
3004 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
3006 uyvytoyuv420 = RENAME(uyvytoyuv420);
3007 uyvytoyuv422 = RENAME(uyvytoyuv422);
3008 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
3009 yuyvtoyuv422 = RENAME(yuyvtoyuv422);