libswscale/rgb2rgb_template.c
1 /*
2 * software RGB to RGB converter
3 * plus software PAL8 to RGB converter
4 * software YUV to YUV converter
5 * software YUV to RGB converter
6 * Written by Nick Kurshev.
7 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
8 * lots of big-endian byte-order fixes by Alex Beregszaszi
10 * This file is part of FFmpeg.
12 * FFmpeg is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * FFmpeg is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with FFmpeg; if not, write to the Free Software
24 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * The C code (not assembly, MMX, ...) of this file can be used
27 * under the LGPL license.
28 */
30 #include <stddef.h>
32 #undef PREFETCH
33 #undef MOVNTQ
34 #undef EMMS
35 #undef SFENCE
36 #undef MMREG_SIZE
37 #undef PREFETCHW
38 #undef PAVGB
40 #if HAVE_SSE2
41 #define MMREG_SIZE 16
42 #else
43 #define MMREG_SIZE 8
44 #endif
46 #if HAVE_AMD3DNOW
47 #define PREFETCH "prefetch"
48 #define PREFETCHW "prefetchw"
49 #define PAVGB "pavgusb"
50 #elif HAVE_MMX2
51 #define PREFETCH "prefetchnta"
52 #define PREFETCHW "prefetcht0"
53 #define PAVGB "pavgb"
54 #else
55 #define PREFETCH " # nop"
56 #define PREFETCHW " # nop"
57 #endif
59 #if HAVE_AMD3DNOW
60 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
61 #define EMMS "femms"
62 #else
63 #define EMMS "emms"
64 #endif
66 #if HAVE_MMX2
67 #define MOVNTQ "movntq"
68 #define SFENCE "sfence"
69 #else
70 #define MOVNTQ "movq"
71 #define SFENCE " # nop"
72 #endif
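/* Illustration (hypothetical helper, not part of the original file): how the
 * macros above combine into one non-temporal copy step. PREFETCH, MOVNTQ,
 * SFENCE and EMMS are the macros defined above, and RENAME() follows this
 * template's naming convention; the function itself is an assumption made
 * for this sketch only. */
#if HAVE_MMX
static inline void RENAME(example_stream_copy8)(uint8_t *dst, const uint8_t *src)
{
    __asm__ volatile(
        PREFETCH" 32%1 \n\t"     // hint: start fetching the next block
        "movq %1, %%mm0 \n\t"    // load 8 bytes into an MMX register
        MOVNTQ" %%mm0, %0 \n\t"  // store; bypasses the cache on MMX2
        :"=m"(*dst)
        :"m"(*src)
        :"memory");
    __asm__ volatile(SFENCE:::"memory"); // order the streaming store
    __asm__ volatile(EMMS:::"memory");   // restore FPU state after MMX use
}
#endif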
74 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
76 uint8_t *dest = dst;
77 const uint8_t *s = src;
78 const uint8_t *end;
79 #if HAVE_MMX
80 const uint8_t *mm_end;
81 #endif
82 end = s + src_size;
83 #if HAVE_MMX
84 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
85 mm_end = end - 23;
86 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
87 while (s < mm_end) {
88 __asm__ volatile(
89 PREFETCH" 32%1 \n\t"
90 "movd %1, %%mm0 \n\t"
91 "punpckldq 3%1, %%mm0 \n\t"
92 "movd 6%1, %%mm1 \n\t"
93 "punpckldq 9%1, %%mm1 \n\t"
94 "movd 12%1, %%mm2 \n\t"
95 "punpckldq 15%1, %%mm2 \n\t"
96 "movd 18%1, %%mm3 \n\t"
97 "punpckldq 21%1, %%mm3 \n\t"
98 "por %%mm7, %%mm0 \n\t"
99 "por %%mm7, %%mm1 \n\t"
100 "por %%mm7, %%mm2 \n\t"
101 "por %%mm7, %%mm3 \n\t"
102 MOVNTQ" %%mm0, %0 \n\t"
103 MOVNTQ" %%mm1, 8%0 \n\t"
104 MOVNTQ" %%mm2, 16%0 \n\t"
105 MOVNTQ" %%mm3, 24%0"
106 :"=m"(*dest)
107 :"m"(*s)
108 :"memory");
109 dest += 32;
110 s += 24;
112 __asm__ volatile(SFENCE:::"memory");
113 __asm__ volatile(EMMS:::"memory");
114 #endif
115 while (s < end) {
116 #if HAVE_BIGENDIAN
117 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
118 *dest++ = 255;
119 *dest++ = s[2];
120 *dest++ = s[1];
121 *dest++ = s[0];
122 s+=3;
123 #else
124 *dest++ = *s++;
125 *dest++ = *s++;
126 *dest++ = *s++;
127 *dest++ = 255;
128 #endif
132 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
134 uint8_t *dest = dst;
135 const uint8_t *s = src;
136 const uint8_t *end;
137 #if HAVE_MMX
138 const uint8_t *mm_end;
139 #endif
140 end = s + src_size;
141 #if HAVE_MMX
142 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
143 mm_end = end - 31;
144 while (s < mm_end) {
145 __asm__ volatile(
146 PREFETCH" 32%1 \n\t"
147 "movq %1, %%mm0 \n\t"
148 "movq 8%1, %%mm1 \n\t"
149 "movq 16%1, %%mm4 \n\t"
150 "movq 24%1, %%mm5 \n\t"
151 "movq %%mm0, %%mm2 \n\t"
152 "movq %%mm1, %%mm3 \n\t"
153 "movq %%mm4, %%mm6 \n\t"
154 "movq %%mm5, %%mm7 \n\t"
155 "psrlq $8, %%mm2 \n\t"
156 "psrlq $8, %%mm3 \n\t"
157 "psrlq $8, %%mm6 \n\t"
158 "psrlq $8, %%mm7 \n\t"
159 "pand %2, %%mm0 \n\t"
160 "pand %2, %%mm1 \n\t"
161 "pand %2, %%mm4 \n\t"
162 "pand %2, %%mm5 \n\t"
163 "pand %3, %%mm2 \n\t"
164 "pand %3, %%mm3 \n\t"
165 "pand %3, %%mm6 \n\t"
166 "pand %3, %%mm7 \n\t"
167 "por %%mm2, %%mm0 \n\t"
168 "por %%mm3, %%mm1 \n\t"
169 "por %%mm6, %%mm4 \n\t"
170 "por %%mm7, %%mm5 \n\t"
172 "movq %%mm1, %%mm2 \n\t"
173 "movq %%mm4, %%mm3 \n\t"
174 "psllq $48, %%mm2 \n\t"
175 "psllq $32, %%mm3 \n\t"
176 "pand %4, %%mm2 \n\t"
177 "pand %5, %%mm3 \n\t"
178 "por %%mm2, %%mm0 \n\t"
179 "psrlq $16, %%mm1 \n\t"
180 "psrlq $32, %%mm4 \n\t"
181 "psllq $16, %%mm5 \n\t"
182 "por %%mm3, %%mm1 \n\t"
183 "pand %6, %%mm5 \n\t"
184 "por %%mm5, %%mm4 \n\t"
186 MOVNTQ" %%mm0, %0 \n\t"
187 MOVNTQ" %%mm1, 8%0 \n\t"
188 MOVNTQ" %%mm4, 16%0"
189 :"=m"(*dest)
190 :"m"(*s),"m"(mask24l),
191 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
192 :"memory");
193 dest += 24;
194 s += 32;
196 __asm__ volatile(SFENCE:::"memory");
197 __asm__ volatile(EMMS:::"memory");
198 #endif
199 while (s < end) {
200 #if HAVE_BIGENDIAN
201 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
202 s++;
203 dest[2] = *s++;
204 dest[1] = *s++;
205 dest[0] = *s++;
206 dest += 3;
207 #else
208 *dest++ = *s++;
209 *dest++ = *s++;
210 *dest++ = *s++;
211 s++;
212 #endif
216 /*
217 original by Strepto/Astral
218 ported to gcc & bugfixed: A'rpi
219 MMX2, 3DNOW optimization by Nick Kurshev
220 32-bit C version, and and&add trick by Michael Niedermayer
221 */
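/* Worked example of the and&add trick (added illustration): for a 15-bit
 * pixel x = 0rrrrrgggggbbbbb, (x & 0x7FFF) + (x & 0x7FE0) adds the R/G bit
 * field to itself, i.e. shifts it left by one place while leaving blue
 * untouched, yielding rrrrrggggg0bbbbb -- RGB565 with a zero as the new low
 * green bit. E.g. x = 0x7FFF (15-bit white):
 * 0x7FFF + 0x7FE0 = 0xFFDF = 11111 111110 11111. */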
222 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, long src_size)
224 register const uint8_t* s=src;
225 register uint8_t* d=dst;
226 register const uint8_t *end;
227 const uint8_t *mm_end;
228 end = s + src_size;
229 #if HAVE_MMX
230 __asm__ volatile(PREFETCH" %0"::"m"(*s));
231 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
232 mm_end = end - 15;
233 while (s<mm_end) {
234 __asm__ volatile(
235 PREFETCH" 32%1 \n\t"
236 "movq %1, %%mm0 \n\t"
237 "movq 8%1, %%mm2 \n\t"
238 "movq %%mm0, %%mm1 \n\t"
239 "movq %%mm2, %%mm3 \n\t"
240 "pand %%mm4, %%mm0 \n\t"
241 "pand %%mm4, %%mm2 \n\t"
242 "paddw %%mm1, %%mm0 \n\t"
243 "paddw %%mm3, %%mm2 \n\t"
244 MOVNTQ" %%mm0, %0 \n\t"
245 MOVNTQ" %%mm2, 8%0"
246 :"=m"(*d)
247 :"m"(*s)
249 d+=16;
250 s+=16;
252 __asm__ volatile(SFENCE:::"memory");
253 __asm__ volatile(EMMS:::"memory");
254 #endif
255 mm_end = end - 3;
256 while (s < mm_end) {
257 register unsigned x= *((const uint32_t *)s);
258 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
259 d+=4;
260 s+=4;
262 if (s < end) {
263 register unsigned short x= *((const uint16_t *)s);
264 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
268 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, long src_size)
270 register const uint8_t* s=src;
271 register uint8_t* d=dst;
272 register const uint8_t *end;
273 const uint8_t *mm_end;
274 end = s + src_size;
275 #if HAVE_MMX
276 __asm__ volatile(PREFETCH" %0"::"m"(*s));
277 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
278 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
279 mm_end = end - 15;
280 while (s<mm_end) {
281 __asm__ volatile(
282 PREFETCH" 32%1 \n\t"
283 "movq %1, %%mm0 \n\t"
284 "movq 8%1, %%mm2 \n\t"
285 "movq %%mm0, %%mm1 \n\t"
286 "movq %%mm2, %%mm3 \n\t"
287 "psrlq $1, %%mm0 \n\t"
288 "psrlq $1, %%mm2 \n\t"
289 "pand %%mm7, %%mm0 \n\t"
290 "pand %%mm7, %%mm2 \n\t"
291 "pand %%mm6, %%mm1 \n\t"
292 "pand %%mm6, %%mm3 \n\t"
293 "por %%mm1, %%mm0 \n\t"
294 "por %%mm3, %%mm2 \n\t"
295 MOVNTQ" %%mm0, %0 \n\t"
296 MOVNTQ" %%mm2, 8%0"
297 :"=m"(*d)
298 :"m"(*s)
300 d+=16;
301 s+=16;
303 __asm__ volatile(SFENCE:::"memory");
304 __asm__ volatile(EMMS:::"memory");
305 #endif
306 mm_end = end - 3;
307 while (s < mm_end) {
308 register uint32_t x= *((const uint32_t*)s);
309 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
310 s+=4;
311 d+=4;
313 if (s < end) {
314 register uint16_t x= *((const uint16_t*)s);
315 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
319 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
321 const uint8_t *s = src;
322 const uint8_t *end;
323 #if HAVE_MMX
324 const uint8_t *mm_end;
325 #endif
326 uint16_t *d = (uint16_t *)dst;
327 end = s + src_size;
328 #if HAVE_MMX
329 mm_end = end - 15;
330 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
331 __asm__ volatile(
332 "movq %3, %%mm5 \n\t"
333 "movq %4, %%mm6 \n\t"
334 "movq %5, %%mm7 \n\t"
335 "jmp 2f \n\t"
336 ASMALIGN(4)
337 "1: \n\t"
338 PREFETCH" 32(%1) \n\t"
339 "movd (%1), %%mm0 \n\t"
340 "movd 4(%1), %%mm3 \n\t"
341 "punpckldq 8(%1), %%mm0 \n\t"
342 "punpckldq 12(%1), %%mm3 \n\t"
343 "movq %%mm0, %%mm1 \n\t"
344 "movq %%mm3, %%mm4 \n\t"
345 "pand %%mm6, %%mm0 \n\t"
346 "pand %%mm6, %%mm3 \n\t"
347 "pmaddwd %%mm7, %%mm0 \n\t"
348 "pmaddwd %%mm7, %%mm3 \n\t"
349 "pand %%mm5, %%mm1 \n\t"
350 "pand %%mm5, %%mm4 \n\t"
351 "por %%mm1, %%mm0 \n\t"
352 "por %%mm4, %%mm3 \n\t"
353 "psrld $5, %%mm0 \n\t"
354 "pslld $11, %%mm3 \n\t"
355 "por %%mm3, %%mm0 \n\t"
356 MOVNTQ" %%mm0, (%0) \n\t"
357 "add $16, %1 \n\t"
358 "add $8, %0 \n\t"
359 "2: \n\t"
360 "cmp %2, %1 \n\t"
361 " jb 1b \n\t"
362 : "+r" (d), "+r"(s)
363 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
365 #else
366 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
367 __asm__ volatile(
368 "movq %0, %%mm7 \n\t"
369 "movq %1, %%mm6 \n\t"
370 ::"m"(red_16mask),"m"(green_16mask));
371 while (s < mm_end) {
372 __asm__ volatile(
373 PREFETCH" 32%1 \n\t"
374 "movd %1, %%mm0 \n\t"
375 "movd 4%1, %%mm3 \n\t"
376 "punpckldq 8%1, %%mm0 \n\t"
377 "punpckldq 12%1, %%mm3 \n\t"
378 "movq %%mm0, %%mm1 \n\t"
379 "movq %%mm0, %%mm2 \n\t"
380 "movq %%mm3, %%mm4 \n\t"
381 "movq %%mm3, %%mm5 \n\t"
382 "psrlq $3, %%mm0 \n\t"
383 "psrlq $3, %%mm3 \n\t"
384 "pand %2, %%mm0 \n\t"
385 "pand %2, %%mm3 \n\t"
386 "psrlq $5, %%mm1 \n\t"
387 "psrlq $5, %%mm4 \n\t"
388 "pand %%mm6, %%mm1 \n\t"
389 "pand %%mm6, %%mm4 \n\t"
390 "psrlq $8, %%mm2 \n\t"
391 "psrlq $8, %%mm5 \n\t"
392 "pand %%mm7, %%mm2 \n\t"
393 "pand %%mm7, %%mm5 \n\t"
394 "por %%mm1, %%mm0 \n\t"
395 "por %%mm4, %%mm3 \n\t"
396 "por %%mm2, %%mm0 \n\t"
397 "por %%mm5, %%mm3 \n\t"
398 "psllq $16, %%mm3 \n\t"
399 "por %%mm3, %%mm0 \n\t"
400 MOVNTQ" %%mm0, %0 \n\t"
401 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
402 d += 4;
403 s += 16;
405 #endif
406 __asm__ volatile(SFENCE:::"memory");
407 __asm__ volatile(EMMS:::"memory");
408 #endif
409 while (s < end) {
410 register int rgb = *(const uint32_t*)s; s += 4;
411 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
415 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
417 const uint8_t *s = src;
418 const uint8_t *end;
419 #if HAVE_MMX
420 const uint8_t *mm_end;
421 #endif
422 uint16_t *d = (uint16_t *)dst;
423 end = s + src_size;
424 #if HAVE_MMX
425 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
426 __asm__ volatile(
427 "movq %0, %%mm7 \n\t"
428 "movq %1, %%mm6 \n\t"
429 ::"m"(red_16mask),"m"(green_16mask));
430 mm_end = end - 15;
431 while (s < mm_end) {
432 __asm__ volatile(
433 PREFETCH" 32%1 \n\t"
434 "movd %1, %%mm0 \n\t"
435 "movd 4%1, %%mm3 \n\t"
436 "punpckldq 8%1, %%mm0 \n\t"
437 "punpckldq 12%1, %%mm3 \n\t"
438 "movq %%mm0, %%mm1 \n\t"
439 "movq %%mm0, %%mm2 \n\t"
440 "movq %%mm3, %%mm4 \n\t"
441 "movq %%mm3, %%mm5 \n\t"
442 "psllq $8, %%mm0 \n\t"
443 "psllq $8, %%mm3 \n\t"
444 "pand %%mm7, %%mm0 \n\t"
445 "pand %%mm7, %%mm3 \n\t"
446 "psrlq $5, %%mm1 \n\t"
447 "psrlq $5, %%mm4 \n\t"
448 "pand %%mm6, %%mm1 \n\t"
449 "pand %%mm6, %%mm4 \n\t"
450 "psrlq $19, %%mm2 \n\t"
451 "psrlq $19, %%mm5 \n\t"
452 "pand %2, %%mm2 \n\t"
453 "pand %2, %%mm5 \n\t"
454 "por %%mm1, %%mm0 \n\t"
455 "por %%mm4, %%mm3 \n\t"
456 "por %%mm2, %%mm0 \n\t"
457 "por %%mm5, %%mm3 \n\t"
458 "psllq $16, %%mm3 \n\t"
459 "por %%mm3, %%mm0 \n\t"
460 MOVNTQ" %%mm0, %0 \n\t"
461 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
462 d += 4;
463 s += 16;
465 __asm__ volatile(SFENCE:::"memory");
466 __asm__ volatile(EMMS:::"memory");
467 #endif
468 while (s < end) {
469 register int rgb = *(const uint32_t*)s; s += 4;
470 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
474 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
476 const uint8_t *s = src;
477 const uint8_t *end;
478 #if HAVE_MMX
479 const uint8_t *mm_end;
480 #endif
481 uint16_t *d = (uint16_t *)dst;
482 end = s + src_size;
483 #if HAVE_MMX
484 mm_end = end - 15;
485 #if 1 // faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
486 __asm__ volatile(
487 "movq %3, %%mm5 \n\t"
488 "movq %4, %%mm6 \n\t"
489 "movq %5, %%mm7 \n\t"
490 "jmp 2f \n\t"
491 ASMALIGN(4)
492 "1: \n\t"
493 PREFETCH" 32(%1) \n\t"
494 "movd (%1), %%mm0 \n\t"
495 "movd 4(%1), %%mm3 \n\t"
496 "punpckldq 8(%1), %%mm0 \n\t"
497 "punpckldq 12(%1), %%mm3 \n\t"
498 "movq %%mm0, %%mm1 \n\t"
499 "movq %%mm3, %%mm4 \n\t"
500 "pand %%mm6, %%mm0 \n\t"
501 "pand %%mm6, %%mm3 \n\t"
502 "pmaddwd %%mm7, %%mm0 \n\t"
503 "pmaddwd %%mm7, %%mm3 \n\t"
504 "pand %%mm5, %%mm1 \n\t"
505 "pand %%mm5, %%mm4 \n\t"
506 "por %%mm1, %%mm0 \n\t"
507 "por %%mm4, %%mm3 \n\t"
508 "psrld $6, %%mm0 \n\t"
509 "pslld $10, %%mm3 \n\t"
510 "por %%mm3, %%mm0 \n\t"
511 MOVNTQ" %%mm0, (%0) \n\t"
512 "add $16, %1 \n\t"
513 "add $8, %0 \n\t"
514 "2: \n\t"
515 "cmp %2, %1 \n\t"
516 " jb 1b \n\t"
517 : "+r" (d), "+r"(s)
518 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
520 #else
521 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
522 __asm__ volatile(
523 "movq %0, %%mm7 \n\t"
524 "movq %1, %%mm6 \n\t"
525 ::"m"(red_15mask),"m"(green_15mask));
526 while (s < mm_end) {
527 __asm__ volatile(
528 PREFETCH" 32%1 \n\t"
529 "movd %1, %%mm0 \n\t"
530 "movd 4%1, %%mm3 \n\t"
531 "punpckldq 8%1, %%mm0 \n\t"
532 "punpckldq 12%1, %%mm3 \n\t"
533 "movq %%mm0, %%mm1 \n\t"
534 "movq %%mm0, %%mm2 \n\t"
535 "movq %%mm3, %%mm4 \n\t"
536 "movq %%mm3, %%mm5 \n\t"
537 "psrlq $3, %%mm0 \n\t"
538 "psrlq $3, %%mm3 \n\t"
539 "pand %2, %%mm0 \n\t"
540 "pand %2, %%mm3 \n\t"
541 "psrlq $6, %%mm1 \n\t"
542 "psrlq $6, %%mm4 \n\t"
543 "pand %%mm6, %%mm1 \n\t"
544 "pand %%mm6, %%mm4 \n\t"
545 "psrlq $9, %%mm2 \n\t"
546 "psrlq $9, %%mm5 \n\t"
547 "pand %%mm7, %%mm2 \n\t"
548 "pand %%mm7, %%mm5 \n\t"
549 "por %%mm1, %%mm0 \n\t"
550 "por %%mm4, %%mm3 \n\t"
551 "por %%mm2, %%mm0 \n\t"
552 "por %%mm5, %%mm3 \n\t"
553 "psllq $16, %%mm3 \n\t"
554 "por %%mm3, %%mm0 \n\t"
555 MOVNTQ" %%mm0, %0 \n\t"
556 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
557 d += 4;
558 s += 16;
560 #endif
561 __asm__ volatile(SFENCE:::"memory");
562 __asm__ volatile(EMMS:::"memory");
563 #endif
564 while (s < end) {
565 register int rgb = *(const uint32_t*)s; s += 4;
566 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
570 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
572 const uint8_t *s = src;
573 const uint8_t *end;
574 #if HAVE_MMX
575 const uint8_t *mm_end;
576 #endif
577 uint16_t *d = (uint16_t *)dst;
578 end = s + src_size;
579 #if HAVE_MMX
580 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
581 __asm__ volatile(
582 "movq %0, %%mm7 \n\t"
583 "movq %1, %%mm6 \n\t"
584 ::"m"(red_15mask),"m"(green_15mask));
585 mm_end = end - 15;
586 while (s < mm_end) {
587 __asm__ volatile(
588 PREFETCH" 32%1 \n\t"
589 "movd %1, %%mm0 \n\t"
590 "movd 4%1, %%mm3 \n\t"
591 "punpckldq 8%1, %%mm0 \n\t"
592 "punpckldq 12%1, %%mm3 \n\t"
593 "movq %%mm0, %%mm1 \n\t"
594 "movq %%mm0, %%mm2 \n\t"
595 "movq %%mm3, %%mm4 \n\t"
596 "movq %%mm3, %%mm5 \n\t"
597 "psllq $7, %%mm0 \n\t"
598 "psllq $7, %%mm3 \n\t"
599 "pand %%mm7, %%mm0 \n\t"
600 "pand %%mm7, %%mm3 \n\t"
601 "psrlq $6, %%mm1 \n\t"
602 "psrlq $6, %%mm4 \n\t"
603 "pand %%mm6, %%mm1 \n\t"
604 "pand %%mm6, %%mm4 \n\t"
605 "psrlq $19, %%mm2 \n\t"
606 "psrlq $19, %%mm5 \n\t"
607 "pand %2, %%mm2 \n\t"
608 "pand %2, %%mm5 \n\t"
609 "por %%mm1, %%mm0 \n\t"
610 "por %%mm4, %%mm3 \n\t"
611 "por %%mm2, %%mm0 \n\t"
612 "por %%mm5, %%mm3 \n\t"
613 "psllq $16, %%mm3 \n\t"
614 "por %%mm3, %%mm0 \n\t"
615 MOVNTQ" %%mm0, %0 \n\t"
616 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
617 d += 4;
618 s += 16;
620 __asm__ volatile(SFENCE:::"memory");
621 __asm__ volatile(EMMS:::"memory");
622 #endif
623 while (s < end) {
624 register int rgb = *(const uint32_t*)s; s += 4;
625 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
629 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
631 const uint8_t *s = src;
632 const uint8_t *end;
633 #if HAVE_MMX
634 const uint8_t *mm_end;
635 #endif
636 uint16_t *d = (uint16_t *)dst;
637 end = s + src_size;
638 #if HAVE_MMX
639 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
640 __asm__ volatile(
641 "movq %0, %%mm7 \n\t"
642 "movq %1, %%mm6 \n\t"
643 ::"m"(red_16mask),"m"(green_16mask));
644 mm_end = end - 11;
645 while (s < mm_end) {
646 __asm__ volatile(
647 PREFETCH" 32%1 \n\t"
648 "movd %1, %%mm0 \n\t"
649 "movd 3%1, %%mm3 \n\t"
650 "punpckldq 6%1, %%mm0 \n\t"
651 "punpckldq 9%1, %%mm3 \n\t"
652 "movq %%mm0, %%mm1 \n\t"
653 "movq %%mm0, %%mm2 \n\t"
654 "movq %%mm3, %%mm4 \n\t"
655 "movq %%mm3, %%mm5 \n\t"
656 "psrlq $3, %%mm0 \n\t"
657 "psrlq $3, %%mm3 \n\t"
658 "pand %2, %%mm0 \n\t"
659 "pand %2, %%mm3 \n\t"
660 "psrlq $5, %%mm1 \n\t"
661 "psrlq $5, %%mm4 \n\t"
662 "pand %%mm6, %%mm1 \n\t"
663 "pand %%mm6, %%mm4 \n\t"
664 "psrlq $8, %%mm2 \n\t"
665 "psrlq $8, %%mm5 \n\t"
666 "pand %%mm7, %%mm2 \n\t"
667 "pand %%mm7, %%mm5 \n\t"
668 "por %%mm1, %%mm0 \n\t"
669 "por %%mm4, %%mm3 \n\t"
670 "por %%mm2, %%mm0 \n\t"
671 "por %%mm5, %%mm3 \n\t"
672 "psllq $16, %%mm3 \n\t"
673 "por %%mm3, %%mm0 \n\t"
674 MOVNTQ" %%mm0, %0 \n\t"
675 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
676 d += 4;
677 s += 12;
679 __asm__ volatile(SFENCE:::"memory");
680 __asm__ volatile(EMMS:::"memory");
681 #endif
682 while (s < end) {
683 const int b = *s++;
684 const int g = *s++;
685 const int r = *s++;
686 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
690 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
692 const uint8_t *s = src;
693 const uint8_t *end;
694 #if HAVE_MMX
695 const uint8_t *mm_end;
696 #endif
697 uint16_t *d = (uint16_t *)dst;
698 end = s + src_size;
699 #if HAVE_MMX
700 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
701 __asm__ volatile(
702 "movq %0, %%mm7 \n\t"
703 "movq %1, %%mm6 \n\t"
704 ::"m"(red_16mask),"m"(green_16mask));
705 mm_end = end - 15;
706 while (s < mm_end) {
707 __asm__ volatile(
708 PREFETCH" 32%1 \n\t"
709 "movd %1, %%mm0 \n\t"
710 "movd 3%1, %%mm3 \n\t"
711 "punpckldq 6%1, %%mm0 \n\t"
712 "punpckldq 9%1, %%mm3 \n\t"
713 "movq %%mm0, %%mm1 \n\t"
714 "movq %%mm0, %%mm2 \n\t"
715 "movq %%mm3, %%mm4 \n\t"
716 "movq %%mm3, %%mm5 \n\t"
717 "psllq $8, %%mm0 \n\t"
718 "psllq $8, %%mm3 \n\t"
719 "pand %%mm7, %%mm0 \n\t"
720 "pand %%mm7, %%mm3 \n\t"
721 "psrlq $5, %%mm1 \n\t"
722 "psrlq $5, %%mm4 \n\t"
723 "pand %%mm6, %%mm1 \n\t"
724 "pand %%mm6, %%mm4 \n\t"
725 "psrlq $19, %%mm2 \n\t"
726 "psrlq $19, %%mm5 \n\t"
727 "pand %2, %%mm2 \n\t"
728 "pand %2, %%mm5 \n\t"
729 "por %%mm1, %%mm0 \n\t"
730 "por %%mm4, %%mm3 \n\t"
731 "por %%mm2, %%mm0 \n\t"
732 "por %%mm5, %%mm3 \n\t"
733 "psllq $16, %%mm3 \n\t"
734 "por %%mm3, %%mm0 \n\t"
735 MOVNTQ" %%mm0, %0 \n\t"
736 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
737 d += 4;
738 s += 12;
740 __asm__ volatile(SFENCE:::"memory");
741 __asm__ volatile(EMMS:::"memory");
742 #endif
743 while (s < end) {
744 const int r = *s++;
745 const int g = *s++;
746 const int b = *s++;
747 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
751 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
753 const uint8_t *s = src;
754 const uint8_t *end;
755 #if HAVE_MMX
756 const uint8_t *mm_end;
757 #endif
758 uint16_t *d = (uint16_t *)dst;
759 end = s + src_size;
760 #if HAVE_MMX
761 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
762 __asm__ volatile(
763 "movq %0, %%mm7 \n\t"
764 "movq %1, %%mm6 \n\t"
765 ::"m"(red_15mask),"m"(green_15mask));
766 mm_end = end - 11;
767 while (s < mm_end) {
768 __asm__ volatile(
769 PREFETCH" 32%1 \n\t"
770 "movd %1, %%mm0 \n\t"
771 "movd 3%1, %%mm3 \n\t"
772 "punpckldq 6%1, %%mm0 \n\t"
773 "punpckldq 9%1, %%mm3 \n\t"
774 "movq %%mm0, %%mm1 \n\t"
775 "movq %%mm0, %%mm2 \n\t"
776 "movq %%mm3, %%mm4 \n\t"
777 "movq %%mm3, %%mm5 \n\t"
778 "psrlq $3, %%mm0 \n\t"
779 "psrlq $3, %%mm3 \n\t"
780 "pand %2, %%mm0 \n\t"
781 "pand %2, %%mm3 \n\t"
782 "psrlq $6, %%mm1 \n\t"
783 "psrlq $6, %%mm4 \n\t"
784 "pand %%mm6, %%mm1 \n\t"
785 "pand %%mm6, %%mm4 \n\t"
786 "psrlq $9, %%mm2 \n\t"
787 "psrlq $9, %%mm5 \n\t"
788 "pand %%mm7, %%mm2 \n\t"
789 "pand %%mm7, %%mm5 \n\t"
790 "por %%mm1, %%mm0 \n\t"
791 "por %%mm4, %%mm3 \n\t"
792 "por %%mm2, %%mm0 \n\t"
793 "por %%mm5, %%mm3 \n\t"
794 "psllq $16, %%mm3 \n\t"
795 "por %%mm3, %%mm0 \n\t"
796 MOVNTQ" %%mm0, %0 \n\t"
797 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
798 d += 4;
799 s += 12;
801 __asm__ volatile(SFENCE:::"memory");
802 __asm__ volatile(EMMS:::"memory");
803 #endif
804 while (s < end) {
805 const int b = *s++;
806 const int g = *s++;
807 const int r = *s++;
808 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
812 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
814 const uint8_t *s = src;
815 const uint8_t *end;
816 #if HAVE_MMX
817 const uint8_t *mm_end;
818 #endif
819 uint16_t *d = (uint16_t *)dst;
820 end = s + src_size;
821 #if HAVE_MMX
822 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
823 __asm__ volatile(
824 "movq %0, %%mm7 \n\t"
825 "movq %1, %%mm6 \n\t"
826 ::"m"(red_15mask),"m"(green_15mask));
827 mm_end = end - 15;
828 while (s < mm_end) {
829 __asm__ volatile(
830 PREFETCH" 32%1 \n\t"
831 "movd %1, %%mm0 \n\t"
832 "movd 3%1, %%mm3 \n\t"
833 "punpckldq 6%1, %%mm0 \n\t"
834 "punpckldq 9%1, %%mm3 \n\t"
835 "movq %%mm0, %%mm1 \n\t"
836 "movq %%mm0, %%mm2 \n\t"
837 "movq %%mm3, %%mm4 \n\t"
838 "movq %%mm3, %%mm5 \n\t"
839 "psllq $7, %%mm0 \n\t"
840 "psllq $7, %%mm3 \n\t"
841 "pand %%mm7, %%mm0 \n\t"
842 "pand %%mm7, %%mm3 \n\t"
843 "psrlq $6, %%mm1 \n\t"
844 "psrlq $6, %%mm4 \n\t"
845 "pand %%mm6, %%mm1 \n\t"
846 "pand %%mm6, %%mm4 \n\t"
847 "psrlq $19, %%mm2 \n\t"
848 "psrlq $19, %%mm5 \n\t"
849 "pand %2, %%mm2 \n\t"
850 "pand %2, %%mm5 \n\t"
851 "por %%mm1, %%mm0 \n\t"
852 "por %%mm4, %%mm3 \n\t"
853 "por %%mm2, %%mm0 \n\t"
854 "por %%mm5, %%mm3 \n\t"
855 "psllq $16, %%mm3 \n\t"
856 "por %%mm3, %%mm0 \n\t"
857 MOVNTQ" %%mm0, %0 \n\t"
858 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
859 d += 4;
860 s += 12;
862 __asm__ volatile(SFENCE:::"memory");
863 __asm__ volatile(EMMS:::"memory");
864 #endif
865 while (s < end) {
866 const int r = *s++;
867 const int g = *s++;
868 const int b = *s++;
869 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
873 /*
874 I use a less accurate approximation here by simply left-shifting the input
875 value and filling the low order bits with zeroes. This method improves PNG
876 compression, but this scheme cannot reproduce white exactly, since it does
877 not generate an all-ones maximum value; the net effect is to darken the
878 image slightly.
880 The better method is "left bit replication":
882  4 3 2 1 0
883  ---------
884  1 1 0 1 1
886  7 6 5 4 3 2 1 0
887  ----------------
888  1 1 0 1 1  1 1 0
889  |=======|  |===|
890      |      leftmost bits repeated to fill open bits
891      |
892  original bits
893 */
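/* A minimal sketch of left bit replication (added illustration; the
 * converters below deliberately keep the faster shift-and-zero-fill
 * approach). The helper is hypothetical; RENAME() follows this template's
 * convention: */
static inline uint8_t RENAME(example_replicate5to8)(uint8_t x5)
{
    /* 0b11011 -> 0b11011110, and 0x1F maps to 0xFF, so white stays white */
    return (uint8_t)((x5 << 3) | (x5 >> 2));
}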
894 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
896 const uint16_t *end;
897 #if HAVE_MMX
898 const uint16_t *mm_end;
899 #endif
900 uint8_t *d = dst;
901 const uint16_t *s = (const uint16_t*)src;
902 end = s + src_size/2;
903 #if HAVE_MMX
904 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
905 mm_end = end - 7;
906 while (s < mm_end) {
907 __asm__ volatile(
908 PREFETCH" 32%1 \n\t"
909 "movq %1, %%mm0 \n\t"
910 "movq %1, %%mm1 \n\t"
911 "movq %1, %%mm2 \n\t"
912 "pand %2, %%mm0 \n\t"
913 "pand %3, %%mm1 \n\t"
914 "pand %4, %%mm2 \n\t"
915 "psllq $3, %%mm0 \n\t"
916 "psrlq $2, %%mm1 \n\t"
917 "psrlq $7, %%mm2 \n\t"
918 "movq %%mm0, %%mm3 \n\t"
919 "movq %%mm1, %%mm4 \n\t"
920 "movq %%mm2, %%mm5 \n\t"
921 "punpcklwd %5, %%mm0 \n\t"
922 "punpcklwd %5, %%mm1 \n\t"
923 "punpcklwd %5, %%mm2 \n\t"
924 "punpckhwd %5, %%mm3 \n\t"
925 "punpckhwd %5, %%mm4 \n\t"
926 "punpckhwd %5, %%mm5 \n\t"
927 "psllq $8, %%mm1 \n\t"
928 "psllq $16, %%mm2 \n\t"
929 "por %%mm1, %%mm0 \n\t"
930 "por %%mm2, %%mm0 \n\t"
931 "psllq $8, %%mm4 \n\t"
932 "psllq $16, %%mm5 \n\t"
933 "por %%mm4, %%mm3 \n\t"
934 "por %%mm5, %%mm3 \n\t"
936 "movq %%mm0, %%mm6 \n\t"
937 "movq %%mm3, %%mm7 \n\t"
939 "movq 8%1, %%mm0 \n\t"
940 "movq 8%1, %%mm1 \n\t"
941 "movq 8%1, %%mm2 \n\t"
942 "pand %2, %%mm0 \n\t"
943 "pand %3, %%mm1 \n\t"
944 "pand %4, %%mm2 \n\t"
945 "psllq $3, %%mm0 \n\t"
946 "psrlq $2, %%mm1 \n\t"
947 "psrlq $7, %%mm2 \n\t"
948 "movq %%mm0, %%mm3 \n\t"
949 "movq %%mm1, %%mm4 \n\t"
950 "movq %%mm2, %%mm5 \n\t"
951 "punpcklwd %5, %%mm0 \n\t"
952 "punpcklwd %5, %%mm1 \n\t"
953 "punpcklwd %5, %%mm2 \n\t"
954 "punpckhwd %5, %%mm3 \n\t"
955 "punpckhwd %5, %%mm4 \n\t"
956 "punpckhwd %5, %%mm5 \n\t"
957 "psllq $8, %%mm1 \n\t"
958 "psllq $16, %%mm2 \n\t"
959 "por %%mm1, %%mm0 \n\t"
960 "por %%mm2, %%mm0 \n\t"
961 "psllq $8, %%mm4 \n\t"
962 "psllq $16, %%mm5 \n\t"
963 "por %%mm4, %%mm3 \n\t"
964 "por %%mm5, %%mm3 \n\t"
966 :"=m"(*d)
967 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
968 :"memory");
969 /* borrowed 32 to 24 */
970 __asm__ volatile(
971 "movq %%mm0, %%mm4 \n\t"
972 "movq %%mm3, %%mm5 \n\t"
973 "movq %%mm6, %%mm0 \n\t"
974 "movq %%mm7, %%mm1 \n\t"
976 "movq %%mm4, %%mm6 \n\t"
977 "movq %%mm5, %%mm7 \n\t"
978 "movq %%mm0, %%mm2 \n\t"
979 "movq %%mm1, %%mm3 \n\t"
981 "psrlq $8, %%mm2 \n\t"
982 "psrlq $8, %%mm3 \n\t"
983 "psrlq $8, %%mm6 \n\t"
984 "psrlq $8, %%mm7 \n\t"
985 "pand %2, %%mm0 \n\t"
986 "pand %2, %%mm1 \n\t"
987 "pand %2, %%mm4 \n\t"
988 "pand %2, %%mm5 \n\t"
989 "pand %3, %%mm2 \n\t"
990 "pand %3, %%mm3 \n\t"
991 "pand %3, %%mm6 \n\t"
992 "pand %3, %%mm7 \n\t"
993 "por %%mm2, %%mm0 \n\t"
994 "por %%mm3, %%mm1 \n\t"
995 "por %%mm6, %%mm4 \n\t"
996 "por %%mm7, %%mm5 \n\t"
998 "movq %%mm1, %%mm2 \n\t"
999 "movq %%mm4, %%mm3 \n\t"
1000 "psllq $48, %%mm2 \n\t"
1001 "psllq $32, %%mm3 \n\t"
1002 "pand %4, %%mm2 \n\t"
1003 "pand %5, %%mm3 \n\t"
1004 "por %%mm2, %%mm0 \n\t"
1005 "psrlq $16, %%mm1 \n\t"
1006 "psrlq $32, %%mm4 \n\t"
1007 "psllq $16, %%mm5 \n\t"
1008 "por %%mm3, %%mm1 \n\t"
1009 "pand %6, %%mm5 \n\t"
1010 "por %%mm5, %%mm4 \n\t"
1012 MOVNTQ" %%mm0, %0 \n\t"
1013 MOVNTQ" %%mm1, 8%0 \n\t"
1014 MOVNTQ" %%mm4, 16%0"
1016 :"=m"(*d)
1017 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1018 :"memory");
1019 d += 24;
1020 s += 8;
1022 __asm__ volatile(SFENCE:::"memory");
1023 __asm__ volatile(EMMS:::"memory");
1024 #endif
1025 while (s < end) {
1026 register uint16_t bgr;
1027 bgr = *s++;
1028 *d++ = (bgr&0x1F)<<3;
1029 *d++ = (bgr&0x3E0)>>2;
1030 *d++ = (bgr&0x7C00)>>7;
1034 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1036 const uint16_t *end;
1037 #if HAVE_MMX
1038 const uint16_t *mm_end;
1039 #endif
1040 uint8_t *d = (uint8_t *)dst;
1041 const uint16_t *s = (const uint16_t *)src;
1042 end = s + src_size/2;
1043 #if HAVE_MMX
1044 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1045 mm_end = end - 7;
1046 while (s < mm_end) {
1047 __asm__ volatile(
1048 PREFETCH" 32%1 \n\t"
1049 "movq %1, %%mm0 \n\t"
1050 "movq %1, %%mm1 \n\t"
1051 "movq %1, %%mm2 \n\t"
1052 "pand %2, %%mm0 \n\t"
1053 "pand %3, %%mm1 \n\t"
1054 "pand %4, %%mm2 \n\t"
1055 "psllq $3, %%mm0 \n\t"
1056 "psrlq $3, %%mm1 \n\t"
1057 "psrlq $8, %%mm2 \n\t"
1058 "movq %%mm0, %%mm3 \n\t"
1059 "movq %%mm1, %%mm4 \n\t"
1060 "movq %%mm2, %%mm5 \n\t"
1061 "punpcklwd %5, %%mm0 \n\t"
1062 "punpcklwd %5, %%mm1 \n\t"
1063 "punpcklwd %5, %%mm2 \n\t"
1064 "punpckhwd %5, %%mm3 \n\t"
1065 "punpckhwd %5, %%mm4 \n\t"
1066 "punpckhwd %5, %%mm5 \n\t"
1067 "psllq $8, %%mm1 \n\t"
1068 "psllq $16, %%mm2 \n\t"
1069 "por %%mm1, %%mm0 \n\t"
1070 "por %%mm2, %%mm0 \n\t"
1071 "psllq $8, %%mm4 \n\t"
1072 "psllq $16, %%mm5 \n\t"
1073 "por %%mm4, %%mm3 \n\t"
1074 "por %%mm5, %%mm3 \n\t"
1076 "movq %%mm0, %%mm6 \n\t"
1077 "movq %%mm3, %%mm7 \n\t"
1079 "movq 8%1, %%mm0 \n\t"
1080 "movq 8%1, %%mm1 \n\t"
1081 "movq 8%1, %%mm2 \n\t"
1082 "pand %2, %%mm0 \n\t"
1083 "pand %3, %%mm1 \n\t"
1084 "pand %4, %%mm2 \n\t"
1085 "psllq $3, %%mm0 \n\t"
1086 "psrlq $3, %%mm1 \n\t"
1087 "psrlq $8, %%mm2 \n\t"
1088 "movq %%mm0, %%mm3 \n\t"
1089 "movq %%mm1, %%mm4 \n\t"
1090 "movq %%mm2, %%mm5 \n\t"
1091 "punpcklwd %5, %%mm0 \n\t"
1092 "punpcklwd %5, %%mm1 \n\t"
1093 "punpcklwd %5, %%mm2 \n\t"
1094 "punpckhwd %5, %%mm3 \n\t"
1095 "punpckhwd %5, %%mm4 \n\t"
1096 "punpckhwd %5, %%mm5 \n\t"
1097 "psllq $8, %%mm1 \n\t"
1098 "psllq $16, %%mm2 \n\t"
1099 "por %%mm1, %%mm0 \n\t"
1100 "por %%mm2, %%mm0 \n\t"
1101 "psllq $8, %%mm4 \n\t"
1102 "psllq $16, %%mm5 \n\t"
1103 "por %%mm4, %%mm3 \n\t"
1104 "por %%mm5, %%mm3 \n\t"
1105 :"=m"(*d)
1106 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1107 :"memory");
1108 /* borrowed 32 to 24 */
1109 __asm__ volatile(
1110 "movq %%mm0, %%mm4 \n\t"
1111 "movq %%mm3, %%mm5 \n\t"
1112 "movq %%mm6, %%mm0 \n\t"
1113 "movq %%mm7, %%mm1 \n\t"
1115 "movq %%mm4, %%mm6 \n\t"
1116 "movq %%mm5, %%mm7 \n\t"
1117 "movq %%mm0, %%mm2 \n\t"
1118 "movq %%mm1, %%mm3 \n\t"
1120 "psrlq $8, %%mm2 \n\t"
1121 "psrlq $8, %%mm3 \n\t"
1122 "psrlq $8, %%mm6 \n\t"
1123 "psrlq $8, %%mm7 \n\t"
1124 "pand %2, %%mm0 \n\t"
1125 "pand %2, %%mm1 \n\t"
1126 "pand %2, %%mm4 \n\t"
1127 "pand %2, %%mm5 \n\t"
1128 "pand %3, %%mm2 \n\t"
1129 "pand %3, %%mm3 \n\t"
1130 "pand %3, %%mm6 \n\t"
1131 "pand %3, %%mm7 \n\t"
1132 "por %%mm2, %%mm0 \n\t"
1133 "por %%mm3, %%mm1 \n\t"
1134 "por %%mm6, %%mm4 \n\t"
1135 "por %%mm7, %%mm5 \n\t"
1137 "movq %%mm1, %%mm2 \n\t"
1138 "movq %%mm4, %%mm3 \n\t"
1139 "psllq $48, %%mm2 \n\t"
1140 "psllq $32, %%mm3 \n\t"
1141 "pand %4, %%mm2 \n\t"
1142 "pand %5, %%mm3 \n\t"
1143 "por %%mm2, %%mm0 \n\t"
1144 "psrlq $16, %%mm1 \n\t"
1145 "psrlq $32, %%mm4 \n\t"
1146 "psllq $16, %%mm5 \n\t"
1147 "por %%mm3, %%mm1 \n\t"
1148 "pand %6, %%mm5 \n\t"
1149 "por %%mm5, %%mm4 \n\t"
1151 MOVNTQ" %%mm0, %0 \n\t"
1152 MOVNTQ" %%mm1, 8%0 \n\t"
1153 MOVNTQ" %%mm4, 16%0"
1155 :"=m"(*d)
1156 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1157 :"memory");
1158 d += 24;
1159 s += 8;
1161 __asm__ volatile(SFENCE:::"memory");
1162 __asm__ volatile(EMMS:::"memory");
1163 #endif
1164 while (s < end) {
1165 register uint16_t bgr;
1166 bgr = *s++;
1167 *d++ = (bgr&0x1F)<<3;
1168 *d++ = (bgr&0x7E0)>>3;
1169 *d++ = (bgr&0xF800)>>8;
1173 /*
1174 * mm0 = 00 B3 00 B2 00 B1 00 B0
1175 * mm1 = 00 G3 00 G2 00 G1 00 G0
1176 * mm2 = 00 R3 00 R2 00 R1 00 R0
1177 * mm6 = FF FF FF FF FF FF FF FF
1178 * mm7 = 00 00 00 00 00 00 00 00
1179 */
1180 #define PACK_RGB32 \
1181 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
1182 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
1183 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
1184 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
1185 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
1186 "movq %%mm0, %%mm3 \n\t" \
1187 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
1188 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
1189 MOVNTQ" %%mm0, %0 \n\t" \
1190 MOVNTQ" %%mm3, 8%0 \n\t" \
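/* Note (added): PACK_RGB32 packs the four separated pixels back into two
 * 8-byte words with 0xFF filling the alpha bytes and streams them to %0 and
 * 8%0; rgb15to32() and rgb16to32() below expand each channel into the
 * mm0-mm2 layout shown above before invoking it. */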
1192 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1194 const uint16_t *end;
1195 #if HAVE_MMX
1196 const uint16_t *mm_end;
1197 #endif
1198 uint8_t *d = dst;
1199 const uint16_t *s = (const uint16_t *)src;
1200 end = s + src_size/2;
1201 #if HAVE_MMX
1202 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1203 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1204 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1205 mm_end = end - 3;
1206 while (s < mm_end) {
1207 __asm__ volatile(
1208 PREFETCH" 32%1 \n\t"
1209 "movq %1, %%mm0 \n\t"
1210 "movq %1, %%mm1 \n\t"
1211 "movq %1, %%mm2 \n\t"
1212 "pand %2, %%mm0 \n\t"
1213 "pand %3, %%mm1 \n\t"
1214 "pand %4, %%mm2 \n\t"
1215 "psllq $3, %%mm0 \n\t"
1216 "psrlq $2, %%mm1 \n\t"
1217 "psrlq $7, %%mm2 \n\t"
1218 PACK_RGB32
1219 :"=m"(*d)
1220 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1221 :"memory");
1222 d += 16;
1223 s += 4;
1225 __asm__ volatile(SFENCE:::"memory");
1226 __asm__ volatile(EMMS:::"memory");
1227 #endif
1228 while (s < end) {
1229 register uint16_t bgr;
1230 bgr = *s++;
1231 #if HAVE_BIGENDIAN
1232 *d++ = 255;
1233 *d++ = (bgr&0x7C00)>>7;
1234 *d++ = (bgr&0x3E0)>>2;
1235 *d++ = (bgr&0x1F)<<3;
1236 #else
1237 *d++ = (bgr&0x1F)<<3;
1238 *d++ = (bgr&0x3E0)>>2;
1239 *d++ = (bgr&0x7C00)>>7;
1240 *d++ = 255;
1241 #endif
1245 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1247 const uint16_t *end;
1248 #if HAVE_MMX
1249 const uint16_t *mm_end;
1250 #endif
1251 uint8_t *d = dst;
1252 const uint16_t *s = (const uint16_t*)src;
1253 end = s + src_size/2;
1254 #if HAVE_MMX
1255 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
1256 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
1257 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
1258 mm_end = end - 3;
1259 while (s < mm_end) {
1260 __asm__ volatile(
1261 PREFETCH" 32%1 \n\t"
1262 "movq %1, %%mm0 \n\t"
1263 "movq %1, %%mm1 \n\t"
1264 "movq %1, %%mm2 \n\t"
1265 "pand %2, %%mm0 \n\t"
1266 "pand %3, %%mm1 \n\t"
1267 "pand %4, %%mm2 \n\t"
1268 "psllq $3, %%mm0 \n\t"
1269 "psrlq $3, %%mm1 \n\t"
1270 "psrlq $8, %%mm2 \n\t"
1271 PACK_RGB32
1272 :"=m"(*d)
1273 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1274 :"memory");
1275 d += 16;
1276 s += 4;
1278 __asm__ volatile(SFENCE:::"memory");
1279 __asm__ volatile(EMMS:::"memory");
1280 #endif
1281 while (s < end) {
1282 register uint16_t bgr;
1283 bgr = *s++;
1284 #if HAVE_BIGENDIAN
1285 *d++ = 255;
1286 *d++ = (bgr&0xF800)>>8;
1287 *d++ = (bgr&0x7E0)>>3;
1288 *d++ = (bgr&0x1F)<<3;
1289 #else
1290 *d++ = (bgr&0x1F)<<3;
1291 *d++ = (bgr&0x7E0)>>3;
1292 *d++ = (bgr&0xF800)>>8;
1293 *d++ = 255;
1294 #endif
1298 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1300 x86_reg idx = 15 - src_size;
1301 const uint8_t *s = src-idx;
1302 uint8_t *d = dst-idx;
1303 #if HAVE_MMX
1304 __asm__ volatile(
1305 "test %0, %0 \n\t"
1306 "jns 2f \n\t"
1307 PREFETCH" (%1, %0) \n\t"
1308 "movq %3, %%mm7 \n\t"
1309 "pxor %4, %%mm7 \n\t"
1310 "movq %%mm7, %%mm6 \n\t"
1311 "pxor %5, %%mm7 \n\t"
1312 ASMALIGN(4)
1313 "1: \n\t"
1314 PREFETCH" 32(%1, %0) \n\t"
1315 "movq (%1, %0), %%mm0 \n\t"
1316 "movq 8(%1, %0), %%mm1 \n\t"
1317 # if HAVE_MMX2
1318 "pshufw $177, %%mm0, %%mm3 \n\t"
1319 "pshufw $177, %%mm1, %%mm5 \n\t"
1320 "pand %%mm7, %%mm0 \n\t"
1321 "pand %%mm6, %%mm3 \n\t"
1322 "pand %%mm7, %%mm1 \n\t"
1323 "pand %%mm6, %%mm5 \n\t"
1324 "por %%mm3, %%mm0 \n\t"
1325 "por %%mm5, %%mm1 \n\t"
1326 # else
1327 "movq %%mm0, %%mm2 \n\t"
1328 "movq %%mm1, %%mm4 \n\t"
1329 "pand %%mm7, %%mm0 \n\t"
1330 "pand %%mm6, %%mm2 \n\t"
1331 "pand %%mm7, %%mm1 \n\t"
1332 "pand %%mm6, %%mm4 \n\t"
1333 "movq %%mm2, %%mm3 \n\t"
1334 "movq %%mm4, %%mm5 \n\t"
1335 "pslld $16, %%mm2 \n\t"
1336 "psrld $16, %%mm3 \n\t"
1337 "pslld $16, %%mm4 \n\t"
1338 "psrld $16, %%mm5 \n\t"
1339 "por %%mm2, %%mm0 \n\t"
1340 "por %%mm4, %%mm1 \n\t"
1341 "por %%mm3, %%mm0 \n\t"
1342 "por %%mm5, %%mm1 \n\t"
1343 # endif
1344 MOVNTQ" %%mm0, (%2, %0) \n\t"
1345 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1346 "add $16, %0 \n\t"
1347 "js 1b \n\t"
1348 SFENCE" \n\t"
1349 EMMS" \n\t"
1350 "2: \n\t"
1351 : "+&r"(idx)
1352 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1353 : "memory");
1354 #endif
1355 for (; idx<15; idx+=4) {
1356 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
1357 v &= 0xff00ff;
1358 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
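/* Worked example for the scalar tail above (added): with the 32-bit word
 * x = 0xAARRGGBB, g = x & 0xFF00FF00 keeps the A and G bytes in place,
 * v = x & 0x00FF00FF isolates R and B, and (v>>16) + (v<<16) exchanges
 * them, so the sum is 0xAABBGGRR -- R and B swapped in three operations. */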
1362 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1364 unsigned i;
1365 #if HAVE_MMX
1366 x86_reg mmx_size= 23 - src_size;
1367 __asm__ volatile (
1368 "test %%"REG_a", %%"REG_a" \n\t"
1369 "jns 2f \n\t"
1370 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1371 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1372 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1373 ASMALIGN(4)
1374 "1: \n\t"
1375 PREFETCH" 32(%1, %%"REG_a") \n\t"
1376 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1377 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1378 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1379 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1380 "pand %%mm5, %%mm0 \n\t"
1381 "pand %%mm6, %%mm1 \n\t"
1382 "pand %%mm7, %%mm2 \n\t"
1383 "por %%mm0, %%mm1 \n\t"
1384 "por %%mm2, %%mm1 \n\t"
1385 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1386 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
1387 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1388 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1389 "pand %%mm7, %%mm0 \n\t"
1390 "pand %%mm5, %%mm1 \n\t"
1391 "pand %%mm6, %%mm2 \n\t"
1392 "por %%mm0, %%mm1 \n\t"
1393 "por %%mm2, %%mm1 \n\t"
1394 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1395 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
1396 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1397 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1398 "pand %%mm6, %%mm0 \n\t"
1399 "pand %%mm7, %%mm1 \n\t"
1400 "pand %%mm5, %%mm2 \n\t"
1401 "por %%mm0, %%mm1 \n\t"
1402 "por %%mm2, %%mm1 \n\t"
1403 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
1404 "add $24, %%"REG_a" \n\t"
1405 " js 1b \n\t"
1406 "2: \n\t"
1407 : "+a" (mmx_size)
1408 : "r" (src-mmx_size), "r"(dst-mmx_size)
1411 __asm__ volatile(SFENCE:::"memory");
1412 __asm__ volatile(EMMS:::"memory");
1414 if (mmx_size==23) return; //finished, was multiple of 8
1416 src+= src_size;
1417 dst+= src_size;
1418 src_size= 23-mmx_size;
1419 src-= src_size;
1420 dst-= src_size;
1421 #endif
1422 for (i=0; i<src_size; i+=3) {
1423 register uint8_t x;
1424 x = src[i + 2];
1425 dst[i + 1] = src[i + 1];
1426 dst[i + 2] = src[i + 0];
1427 dst[i + 0] = x;
1431 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1432 long width, long height,
1433 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1435 long y;
1436 const x86_reg chromWidth= width>>1;
1437 for (y=0; y<height; y++) {
1438 #if HAVE_MMX
1439 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1440 __asm__ volatile(
1441 "xor %%"REG_a", %%"REG_a" \n\t"
1442 ASMALIGN(4)
1443 "1: \n\t"
1444 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1445 PREFETCH" 32(%2, %%"REG_a") \n\t"
1446 PREFETCH" 32(%3, %%"REG_a") \n\t"
1447 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1448 "movq %%mm0, %%mm2 \n\t" // U(0)
1449 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1450 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1451 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1453 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1454 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1455 "movq %%mm3, %%mm4 \n\t" // Y(0)
1456 "movq %%mm5, %%mm6 \n\t" // Y(8)
1457 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1458 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1459 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1460 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1462 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
1463 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1464 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
1465 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1467 "add $8, %%"REG_a" \n\t"
1468 "cmp %4, %%"REG_a" \n\t"
1469 " jb 1b \n\t"
1470 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1471 : "%"REG_a
1473 #else
1475 #if ARCH_ALPHA && HAVE_MVI
1476 #define pl2yuy2(n) \
1477 y1 = yc[n]; \
1478 y2 = yc2[n]; \
1479 u = uc[n]; \
1480 v = vc[n]; \
1481 __asm__("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1482 __asm__("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1483 __asm__("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1484 __asm__("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1485 yuv1 = (u << 8) + (v << 24); \
1486 yuv2 = yuv1 + y2; \
1487 yuv1 += y1; \
1488 qdst[n] = yuv1; \
1489 qdst2[n] = yuv2;
1491 int i;
1492 uint64_t *qdst = (uint64_t *) dst;
1493 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1494 const uint32_t *yc = (uint32_t *) ysrc;
1495 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1496 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1497 for (i = 0; i < chromWidth; i += 8) {
1498 uint64_t y1, y2, yuv1, yuv2;
1499 uint64_t u, v;
1500 /* Prefetch */
1501 __asm__("ldq $31,64(%0)" :: "r"(yc));
1502 __asm__("ldq $31,64(%0)" :: "r"(yc2));
1503 __asm__("ldq $31,64(%0)" :: "r"(uc));
1504 __asm__("ldq $31,64(%0)" :: "r"(vc));
1506 pl2yuy2(0);
1507 pl2yuy2(1);
1508 pl2yuy2(2);
1509 pl2yuy2(3);
1511 yc += 4;
1512 yc2 += 4;
1513 uc += 4;
1514 vc += 4;
1515 qdst += 4;
1516 qdst2 += 4;
1518 y++;
1519 ysrc += lumStride;
1520 dst += dstStride;
1522 #elif HAVE_FAST_64BIT
1523 int i;
1524 uint64_t *ldst = (uint64_t *) dst;
1525 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1526 for (i = 0; i < chromWidth; i += 2) {
1527 uint64_t k, l;
1528 k = yc[0] + (uc[0] << 8) +
1529 (yc[1] << 16) + (vc[0] << 24);
1530 l = yc[2] + (uc[1] << 8) +
1531 (yc[3] << 16) + (vc[1] << 24);
1532 *ldst++ = k + (l << 32);
1533 yc += 4;
1534 uc += 2;
1535 vc += 2;
1538 #else
1539 int i, *idst = (int32_t *) dst;
1540 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1541 for (i = 0; i < chromWidth; i++) {
1542 #if HAVE_BIGENDIAN
1543 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1544 (yc[1] << 8) + (vc[0] << 0);
1545 #else
1546 *idst++ = yc[0] + (uc[0] << 8) +
1547 (yc[1] << 16) + (vc[0] << 24);
1548 #endif
1549 yc += 2;
1550 uc++;
1551 vc++;
1553 #endif
1554 #endif
1555 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1556 usrc += chromStride;
1557 vsrc += chromStride;
1559 ysrc += lumStride;
1560 dst += dstStride;
1562 #if HAVE_MMX
1563 __asm__(EMMS" \n\t"
1564 SFENCE" \n\t"
1565 :::"memory");
1566 #endif
1569 /**
1570 * Height should be a multiple of 2 and width should be a multiple of 16.
1571 * (If this is a problem for anyone then tell me, and I will fix it.)
1572 */
1573 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1574 long width, long height,
1575 long lumStride, long chromStride, long dstStride)
1577 //FIXME interpolate chroma
1578 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1581 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1582 long width, long height,
1583 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1585 long y;
1586 const x86_reg chromWidth= width>>1;
1587 for (y=0; y<height; y++) {
1588 #if HAVE_MMX
1589 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
1590 __asm__ volatile(
1591 "xor %%"REG_a", %%"REG_a" \n\t"
1592 ASMALIGN(4)
1593 "1: \n\t"
1594 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1595 PREFETCH" 32(%2, %%"REG_a") \n\t"
1596 PREFETCH" 32(%3, %%"REG_a") \n\t"
1597 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1598 "movq %%mm0, %%mm2 \n\t" // U(0)
1599 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1600 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1601 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1603 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1604 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1605 "movq %%mm0, %%mm4 \n\t" // Y(0)
1606 "movq %%mm2, %%mm6 \n\t" // Y(8)
1607 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1608 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1609 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1610 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1612 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
1613 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
1614 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
1615 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
1617 "add $8, %%"REG_a" \n\t"
1618 "cmp %4, %%"REG_a" \n\t"
1619 " jb 1b \n\t"
1620 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1621 : "%"REG_a
1623 #else
1624 //FIXME adapt the Alpha ASM code from yv12->yuy2
1626 #if HAVE_FAST_64BIT
1627 int i;
1628 uint64_t *ldst = (uint64_t *) dst;
1629 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1630 for (i = 0; i < chromWidth; i += 2) {
1631 uint64_t k, l;
1632 k = uc[0] + (yc[0] << 8) +
1633 (vc[0] << 16) + (yc[1] << 24);
1634 l = uc[1] + (yc[2] << 8) +
1635 (vc[1] << 16) + (yc[3] << 24);
1636 *ldst++ = k + (l << 32);
1637 yc += 4;
1638 uc += 2;
1639 vc += 2;
1642 #else
1643 int i, *idst = (int32_t *) dst;
1644 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1645 for (i = 0; i < chromWidth; i++) {
1646 #if HAVE_BIGENDIAN
1647 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1648 (vc[0] << 8) + (yc[1] << 0);
1649 #else
1650 *idst++ = uc[0] + (yc[0] << 8) +
1651 (vc[0] << 16) + (yc[1] << 24);
1652 #endif
1653 yc += 2;
1654 uc++;
1655 vc++;
1657 #endif
1658 #endif
1659 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
1660 usrc += chromStride;
1661 vsrc += chromStride;
1663 ysrc += lumStride;
1664 dst += dstStride;
1666 #if HAVE_MMX
1667 __asm__(EMMS" \n\t"
1668 SFENCE" \n\t"
1669 :::"memory");
1670 #endif
1673 /**
1674 * Height should be a multiple of 2 and width should be a multiple of 16.
1675 * (If this is a problem for anyone then tell me, and I will fix it.)
1676 */
1677 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1678 long width, long height,
1679 long lumStride, long chromStride, long dstStride)
1681 //FIXME interpolate chroma
1682 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1685 /**
1686 * Width should be a multiple of 16.
1687 */
1688 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1689 long width, long height,
1690 long lumStride, long chromStride, long dstStride)
1692 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1695 /**
1696 * Width should be a multiple of 16.
1697 */
1698 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1699 long width, long height,
1700 long lumStride, long chromStride, long dstStride)
1702 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
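/* Note (added): vertLumPerChroma (a power of two) is the number of luma
 * lines per chroma line: the planar-to-packed workers above advance
 * usrc/vsrc only every vertLumPerChroma-th line, so 2 packs 4:2:0 input
 * (the yv12* wrappers) and 1 packs 4:2:2 input (the yuv422p* wrappers). */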
1705 /**
1706 * Height should be a multiple of 2 and width should be a multiple of 16.
1707 * (If this is a problem for anyone then tell me, and I will fix it.)
1708 */
1709 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1710 long width, long height,
1711 long lumStride, long chromStride, long srcStride)
1713 long y;
1714 const x86_reg chromWidth= width>>1;
1715 for (y=0; y<height; y+=2) {
1716 #if HAVE_MMX
1717 __asm__ volatile(
1718 "xor %%"REG_a", %%"REG_a" \n\t"
1719 "pcmpeqw %%mm7, %%mm7 \n\t"
1720 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1721 ASMALIGN(4)
1722 "1: \n\t"
1723 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1724 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1725 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1726 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1727 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1728 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1729 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1730 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1731 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1732 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1733 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1735 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1737 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
1738 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
1739 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1740 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1741 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1742 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1743 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1744 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1745 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1746 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1748 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1750 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1751 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1752 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1753 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1754 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1755 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1756 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1757 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1759 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1760 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1762 "add $8, %%"REG_a" \n\t"
1763 "cmp %4, %%"REG_a" \n\t"
1764 " jb 1b \n\t"
1765 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1766 : "memory", "%"REG_a
1769 ydst += lumStride;
1770 src += srcStride;
1772 __asm__ volatile(
1773 "xor %%"REG_a", %%"REG_a" \n\t"
1774 ASMALIGN(4)
1775 "1: \n\t"
1776 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1777 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1778 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
1779 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
1780 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
1781 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1782 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1783 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1784 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1785 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1786 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1788 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
1789 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
1791 "add $8, %%"REG_a" \n\t"
1792 "cmp %4, %%"REG_a" \n\t"
1793 " jb 1b \n\t"
1795 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1796 : "memory", "%"REG_a
1798 #else
1799 long i;
1800 for (i=0; i<chromWidth; i++) {
1801 ydst[2*i+0] = src[4*i+0];
1802 udst[i] = src[4*i+1];
1803 ydst[2*i+1] = src[4*i+2];
1804 vdst[i] = src[4*i+3];
1806 ydst += lumStride;
1807 src += srcStride;
1809 for (i=0; i<chromWidth; i++) {
1810 ydst[2*i+0] = src[4*i+0];
1811 ydst[2*i+1] = src[4*i+2];
1813 #endif
1814 udst += chromStride;
1815 vdst += chromStride;
1816 ydst += lumStride;
1817 src += srcStride;
1819 #if HAVE_MMX
1820 __asm__ volatile(EMMS" \n\t"
1821 SFENCE" \n\t"
1822 :::"memory");
1823 #endif
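/* Note (added): the YUY2 split above relies on mm7 = 0x00FF00FF...: pand
 * keeps the even-numbered (luma) bytes, psrlw $8 exposes the odd-numbered
 * (chroma) bytes, and packuswb narrows each register pair back to bytes. */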
1826 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1827 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1828 long width, long height, long lumStride, long chromStride)
1830 /* Y Plane */
1831 memcpy(ydst, ysrc, width*height);
1833 /* XXX: implement upscaling for U,V */
1836 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
1838 long x,y;
1840 dst[0]= src[0];
1842 // first line
1843 for (x=0; x<srcWidth-1; x++) {
1844 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1845 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1847 dst[2*srcWidth-1]= src[srcWidth-1];
1849 dst+= dstStride;
1851 for (y=1; y<srcHeight; y++) {
1852 #if HAVE_MMX2 || HAVE_AMD3DNOW
1853 const x86_reg mmxSize= srcWidth&~15;
1854 __asm__ volatile(
1855 "mov %4, %%"REG_a" \n\t"
1856 "1: \n\t"
1857 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1858 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1859 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1860 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1861 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1862 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1863 PAVGB" %%mm0, %%mm5 \n\t"
1864 PAVGB" %%mm0, %%mm3 \n\t"
1865 PAVGB" %%mm0, %%mm5 \n\t"
1866 PAVGB" %%mm0, %%mm3 \n\t"
1867 PAVGB" %%mm1, %%mm4 \n\t"
1868 PAVGB" %%mm1, %%mm2 \n\t"
1869 PAVGB" %%mm1, %%mm4 \n\t"
1870 PAVGB" %%mm1, %%mm2 \n\t"
1871 "movq %%mm5, %%mm7 \n\t"
1872 "movq %%mm4, %%mm6 \n\t"
1873 "punpcklbw %%mm3, %%mm5 \n\t"
1874 "punpckhbw %%mm3, %%mm7 \n\t"
1875 "punpcklbw %%mm2, %%mm4 \n\t"
1876 "punpckhbw %%mm2, %%mm6 \n\t"
1877 #if 1
1878 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
1879 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1880 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
1881 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1882 #else
1883 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1884 "movq %%mm7, 8(%2, %%"REG_a", 2) \n\t"
1885 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1886 "movq %%mm6, 8(%3, %%"REG_a", 2) \n\t"
1887 #endif
1888 "add $8, %%"REG_a" \n\t"
1889 " js 1b \n\t"
1890 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1891 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1892 "g" (-mmxSize)
1893 : "%"REG_a
1896 #else
1897 const x86_reg mmxSize=1;
1898 #endif
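/* Note (added): each PAVGB above is applied twice against the same source
 * row, i.e. avg(avg(a,b),b), which approximates (with slightly different
 * rounding) the (3*b + a)>>2 weighting that the scalar interpolation just
 * below computes exactly. */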
1899 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1900 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
1902 for (x=mmxSize-1; x<srcWidth-1; x++) {
1903 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1904 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1905 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1906 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1908 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1909 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1911 dst+=dstStride*2;
1912 src+=srcStride;
1915 // last line
1916 #if 1
1917 dst[0]= src[0];
1919 for (x=0; x<srcWidth-1; x++) {
1920 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1921 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1923 dst[2*srcWidth-1]= src[srcWidth-1];
1924 #else
1925 for (x=0; x<srcWidth; x++) {
1926 dst[2*x+0]=
1927 dst[2*x+1]= src[x];
1929 #endif
1931 #if HAVE_MMX
1932 __asm__ volatile(EMMS" \n\t"
1933 SFENCE" \n\t"
1934 :::"memory");
1935 #endif
1938 /**
1939 * Height should be a multiple of 2 and width should be a multiple of 16.
1940 * (If this is a problem for anyone then tell me, and I will fix it.)
1941 * Chrominance data is only taken from every second line, others are ignored.
1942 * FIXME: Write HQ version.
1943 */
1944 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1945 long width, long height,
1946 long lumStride, long chromStride, long srcStride)
1948 long y;
1949 const x86_reg chromWidth= width>>1;
1950 for (y=0; y<height; y+=2) {
1951 #if HAVE_MMX
1952 __asm__ volatile(
1953 "xor %%"REG_a", %%"REG_a" \n\t"
1954 "pcmpeqw %%mm7, %%mm7 \n\t"
1955 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1956 ASMALIGN(4)
1957 "1: \n\t"
1958 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1959 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1960 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1961 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1962 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1963 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1964 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1965 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1966 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1967 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1968 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1970 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1972 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1973 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
1974 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1975 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1976 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1977 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1978 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1979 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1980 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1981 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1983 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
1985 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1986 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1987 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1988 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1989 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1990 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1991 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1992 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1994 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1995 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1997 "add $8, %%"REG_a" \n\t"
1998 "cmp %4, %%"REG_a" \n\t"
1999 " jb 1b \n\t"
2000 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2001 : "memory", "%"REG_a
2004 ydst += lumStride;
2005 src += srcStride;
2007 __asm__ volatile(
2008 "xor %%"REG_a", %%"REG_a" \n\t"
2009 ASMALIGN(4)
2010 "1: \n\t"
2011 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2012 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2013 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2014 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2015 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2016 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2017 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2018 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2019 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2020 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2021 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2023 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2024 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2026 "add $8, %%"REG_a" \n\t"
2027 "cmp %4, %%"REG_a" \n\t"
2028 " jb 1b \n\t"
2030 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2031 : "memory", "%"REG_a
2033 #else
2034 long i;
2035 for (i=0; i<chromWidth; i++) {
2036 udst[i] = src[4*i+0];
2037 ydst[2*i+0] = src[4*i+1];
2038 vdst[i] = src[4*i+2];
2039 ydst[2*i+1] = src[4*i+3];
2040 }
2041 ydst += lumStride;
2042 src += srcStride;
2044 for (i=0; i<chromWidth; i++) {
2045 ydst[2*i+0] = src[4*i+1];
2046 ydst[2*i+1] = src[4*i+3];
2047 }
2048 #endif
2049 udst += chromStride;
2050 vdst += chromStride;
2051 ydst += lumStride;
2052 src += srcStride;
2053 }
2054 #if HAVE_MMX
2055 __asm__ volatile(EMMS" \n\t"
2056 SFENCE" \n\t"
2057 :::"memory");
2058 #endif
2059 }
2061 /**
2062 * Height should be a multiple of 2 and width should be a multiple of 2.
2063 * (If this is a problem for anyone then tell me, and I will fix it.)
2064 * Chrominance data is only taken from every second line,
2065 * others are ignored in the C version.
2066 * FIXME: Write HQ version.
2067 */
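/* Both paths below are assumed to use the usual BT.601 weights scaled by
 * (1<<RGB2YUV_SHIFT): the MMX code via the ff_bgr2*Coeff/Offset tables, the
 * C code via the RY..BV macros, e.g.
 * Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16. */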
2068 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2069 long width, long height,
2070 long lumStride, long chromStride, long srcStride)
2071 {
2072 long y;
2073 const x86_reg chromWidth= width>>1;
2074 #if HAVE_MMX
2075 for (y=0; y<height-2; y+=2) {
2076 long i;
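// Each pass of the y loop converts two source rows: the i loop computes
// luma for both rows, then src is rewound (src -= srcStride*2) and the
// chroma pass averages the row pair into one U row and one V row.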
2077 for (i=0; i<2; i++) {
2078 __asm__ volatile(
2079 "mov %2, %%"REG_a" \n\t"
2080 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
2081 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2082 "pxor %%mm7, %%mm7 \n\t"
2083 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2084 ASMALIGN(4)
2085 "1: \n\t"
2086 PREFETCH" 64(%0, %%"REG_d") \n\t"
2087 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2088 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2089 "punpcklbw %%mm7, %%mm0 \n\t"
2090 "punpcklbw %%mm7, %%mm1 \n\t"
2091 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2092 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2093 "punpcklbw %%mm7, %%mm2 \n\t"
2094 "punpcklbw %%mm7, %%mm3 \n\t"
2095 "pmaddwd %%mm6, %%mm0 \n\t"
2096 "pmaddwd %%mm6, %%mm1 \n\t"
2097 "pmaddwd %%mm6, %%mm2 \n\t"
2098 "pmaddwd %%mm6, %%mm3 \n\t"
2099 #ifndef FAST_BGR2YV12
2100 "psrad $8, %%mm0 \n\t"
2101 "psrad $8, %%mm1 \n\t"
2102 "psrad $8, %%mm2 \n\t"
2103 "psrad $8, %%mm3 \n\t"
2104 #endif
2105 "packssdw %%mm1, %%mm0 \n\t"
2106 "packssdw %%mm3, %%mm2 \n\t"
2107 "pmaddwd %%mm5, %%mm0 \n\t"
2108 "pmaddwd %%mm5, %%mm2 \n\t"
2109 "packssdw %%mm2, %%mm0 \n\t"
2110 "psraw $7, %%mm0 \n\t"
2112 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2113 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2114 "punpcklbw %%mm7, %%mm4 \n\t"
2115 "punpcklbw %%mm7, %%mm1 \n\t"
2116 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2117 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2118 "punpcklbw %%mm7, %%mm2 \n\t"
2119 "punpcklbw %%mm7, %%mm3 \n\t"
2120 "pmaddwd %%mm6, %%mm4 \n\t"
2121 "pmaddwd %%mm6, %%mm1 \n\t"
2122 "pmaddwd %%mm6, %%mm2 \n\t"
2123 "pmaddwd %%mm6, %%mm3 \n\t"
2124 #ifndef FAST_BGR2YV12
2125 "psrad $8, %%mm4 \n\t"
2126 "psrad $8, %%mm1 \n\t"
2127 "psrad $8, %%mm2 \n\t"
2128 "psrad $8, %%mm3 \n\t"
2129 #endif
2130 "packssdw %%mm1, %%mm4 \n\t"
2131 "packssdw %%mm3, %%mm2 \n\t"
2132 "pmaddwd %%mm5, %%mm4 \n\t"
2133 "pmaddwd %%mm5, %%mm2 \n\t"
2134 "add $24, %%"REG_d" \n\t"
2135 "packssdw %%mm2, %%mm4 \n\t"
2136 "psraw $7, %%mm4 \n\t"
2138 "packuswb %%mm4, %%mm0 \n\t"
2139 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
2141 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2142 "add $8, %%"REG_a" \n\t"
2143 " js 1b \n\t"
2144 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
2145 : "%"REG_a, "%"REG_d
2147 ydst += lumStride;
2148 src += srcStride;
2149 }
2150 src -= srcStride*2;
2151 __asm__ volatile(
2152 "mov %4, %%"REG_a" \n\t"
2153 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2154 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
2155 "pxor %%mm7, %%mm7 \n\t"
2156 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
2157 "add %%"REG_d", %%"REG_d" \n\t"
2158 ASMALIGN(4)
2159 "1: \n\t"
2160 PREFETCH" 64(%0, %%"REG_d") \n\t"
2161 PREFETCH" 64(%1, %%"REG_d") \n\t"
2162 #if HAVE_MMX2 || HAVE_AMD3DNOW
2163 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2164 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2165 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2166 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2167 PAVGB" %%mm1, %%mm0 \n\t"
2168 PAVGB" %%mm3, %%mm2 \n\t"
2169 "movq %%mm0, %%mm1 \n\t"
2170 "movq %%mm2, %%mm3 \n\t"
2171 "psrlq $24, %%mm0 \n\t"
2172 "psrlq $24, %%mm2 \n\t"
2173 PAVGB" %%mm1, %%mm0 \n\t"
2174 PAVGB" %%mm3, %%mm2 \n\t"
2175 "punpcklbw %%mm7, %%mm0 \n\t"
2176 "punpcklbw %%mm7, %%mm2 \n\t"
2177 #else
2178 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2179 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2180 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2181 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2182 "punpcklbw %%mm7, %%mm0 \n\t"
2183 "punpcklbw %%mm7, %%mm1 \n\t"
2184 "punpcklbw %%mm7, %%mm2 \n\t"
2185 "punpcklbw %%mm7, %%mm3 \n\t"
2186 "paddw %%mm1, %%mm0 \n\t"
2187 "paddw %%mm3, %%mm2 \n\t"
2188 "paddw %%mm2, %%mm0 \n\t"
2189 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2190 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2191 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2192 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2193 "punpcklbw %%mm7, %%mm4 \n\t"
2194 "punpcklbw %%mm7, %%mm1 \n\t"
2195 "punpcklbw %%mm7, %%mm2 \n\t"
2196 "punpcklbw %%mm7, %%mm3 \n\t"
2197 "paddw %%mm1, %%mm4 \n\t"
2198 "paddw %%mm3, %%mm2 \n\t"
2199 "paddw %%mm4, %%mm2 \n\t"
2200 "psrlw $2, %%mm0 \n\t"
2201 "psrlw $2, %%mm2 \n\t"
2202 #endif
2203 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2204 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2206 "pmaddwd %%mm0, %%mm1 \n\t"
2207 "pmaddwd %%mm2, %%mm3 \n\t"
2208 "pmaddwd %%mm6, %%mm0 \n\t"
2209 "pmaddwd %%mm6, %%mm2 \n\t"
2210 #ifndef FAST_BGR2YV12
2211 "psrad $8, %%mm0 \n\t"
2212 "psrad $8, %%mm1 \n\t"
2213 "psrad $8, %%mm2 \n\t"
2214 "psrad $8, %%mm3 \n\t"
2215 #endif
2216 "packssdw %%mm2, %%mm0 \n\t"
2217 "packssdw %%mm3, %%mm1 \n\t"
2218 "pmaddwd %%mm5, %%mm0 \n\t"
2219 "pmaddwd %%mm5, %%mm1 \n\t"
2220 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2221 "psraw $7, %%mm0 \n\t"
2223 #if HAVE_MMX2 || HAVE_AMD3DNOW
2224 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2225 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2226 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2227 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2228 PAVGB" %%mm1, %%mm4 \n\t"
2229 PAVGB" %%mm3, %%mm2 \n\t"
2230 "movq %%mm4, %%mm1 \n\t"
2231 "movq %%mm2, %%mm3 \n\t"
2232 "psrlq $24, %%mm4 \n\t"
2233 "psrlq $24, %%mm2 \n\t"
2234 PAVGB" %%mm1, %%mm4 \n\t"
2235 PAVGB" %%mm3, %%mm2 \n\t"
2236 "punpcklbw %%mm7, %%mm4 \n\t"
2237 "punpcklbw %%mm7, %%mm2 \n\t"
2238 #else
2239 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2240 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2241 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2242 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2243 "punpcklbw %%mm7, %%mm4 \n\t"
2244 "punpcklbw %%mm7, %%mm1 \n\t"
2245 "punpcklbw %%mm7, %%mm2 \n\t"
2246 "punpcklbw %%mm7, %%mm3 \n\t"
2247 "paddw %%mm1, %%mm4 \n\t"
2248 "paddw %%mm3, %%mm2 \n\t"
2249 "paddw %%mm2, %%mm4 \n\t"
2250 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2251 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2252 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2253 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2254 "punpcklbw %%mm7, %%mm5 \n\t"
2255 "punpcklbw %%mm7, %%mm1 \n\t"
2256 "punpcklbw %%mm7, %%mm2 \n\t"
2257 "punpcklbw %%mm7, %%mm3 \n\t"
2258 "paddw %%mm1, %%mm5 \n\t"
2259 "paddw %%mm3, %%mm2 \n\t"
2260 "paddw %%mm5, %%mm2 \n\t"
2261 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
2262 "psrlw $2, %%mm4 \n\t"
2263 "psrlw $2, %%mm2 \n\t"
2264 #endif
2265 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
2266 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
2268 "pmaddwd %%mm4, %%mm1 \n\t"
2269 "pmaddwd %%mm2, %%mm3 \n\t"
2270 "pmaddwd %%mm6, %%mm4 \n\t"
2271 "pmaddwd %%mm6, %%mm2 \n\t"
2272 #ifndef FAST_BGR2YV12
2273 "psrad $8, %%mm4 \n\t"
2274 "psrad $8, %%mm1 \n\t"
2275 "psrad $8, %%mm2 \n\t"
2276 "psrad $8, %%mm3 \n\t"
2277 #endif
2278 "packssdw %%mm2, %%mm4 \n\t"
2279 "packssdw %%mm3, %%mm1 \n\t"
2280 "pmaddwd %%mm5, %%mm4 \n\t"
2281 "pmaddwd %%mm5, %%mm1 \n\t"
2282 "add $24, %%"REG_d" \n\t"
2283 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2284 "psraw $7, %%mm4 \n\t"
2286 "movq %%mm0, %%mm1 \n\t"
2287 "punpckldq %%mm4, %%mm0 \n\t"
2288 "punpckhdq %%mm4, %%mm1 \n\t"
2289 "packsswb %%mm1, %%mm0 \n\t"
2290 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
2291 "movd %%mm0, (%2, %%"REG_a") \n\t"
2292 "punpckhdq %%mm0, %%mm0 \n\t"
2293 "movd %%mm0, (%3, %%"REG_a") \n\t"
2294 "add $4, %%"REG_a" \n\t"
2295 " js 1b \n\t"
2296 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2297 : "%"REG_a, "%"REG_d
2300 udst += chromStride;
2301 vdst += chromStride;
2302 src += srcStride*2;
2303 }
2305 __asm__ volatile(EMMS" \n\t"
2306 SFENCE" \n\t"
2307 :::"memory");
2308 #else
2309 y=0;
2310 #endif
2311 for (; y<height; y+=2) {
2312 long i;
2313 for (i=0; i<chromWidth; i++) {
2314 unsigned int b = src[6*i+0];
2315 unsigned int g = src[6*i+1];
2316 unsigned int r = src[6*i+2];
2318 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2319 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2320 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2322 udst[i] = U;
2323 vdst[i] = V;
2324 ydst[2*i] = Y;
2326 b = src[6*i+3];
2327 g = src[6*i+4];
2328 r = src[6*i+5];
2330 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2331 ydst[2*i+1] = Y;
2332 }
2333 ydst += lumStride;
2334 src += srcStride;
2336 for (i=0; i<chromWidth; i++) {
2337 unsigned int b = src[6*i+0];
2338 unsigned int g = src[6*i+1];
2339 unsigned int r = src[6*i+2];
2341 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2343 ydst[2*i] = Y;
2345 b = src[6*i+3];
2346 g = src[6*i+4];
2347 r = src[6*i+5];
2349 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350 ydst[2*i+1] = Y;
2351 }
2352 udst += chromStride;
2353 vdst += chromStride;
2354 ydst += lumStride;
2355 src += srcStride;
2356 }
2357 }
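/* Interleaves two byte planes: dest[2*w+0] = src1[w], dest[2*w+1] = src2[w],
 * e.g. for merging separate U and V planes into packed UV lines. The SSE2
 * path uses movdqa/movntdq and therefore assumes 16-byte-aligned rows. */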
2359 static void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2360 long width, long height, long src1Stride,
2361 long src2Stride, long dstStride)
2362 {
2363 long h;
2365 for (h=0; h < height; h++) {
2366 long w;
2368 #if HAVE_MMX
2369 #if HAVE_SSE2
2370 __asm__(
2371 "xor %%"REG_a", %%"REG_a" \n\t"
2372 "1: \n\t"
2373 PREFETCH" 64(%1, %%"REG_a") \n\t"
2374 PREFETCH" 64(%2, %%"REG_a") \n\t"
2375 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2376 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2377 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2378 "punpcklbw %%xmm2, %%xmm0 \n\t"
2379 "punpckhbw %%xmm2, %%xmm1 \n\t"
2380 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
2381 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
2382 "add $16, %%"REG_a" \n\t"
2383 "cmp %3, %%"REG_a" \n\t"
2384 " jb 1b \n\t"
2385 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2386 : "memory", "%"REG_a""
2388 #else
2389 __asm__(
2390 "xor %%"REG_a", %%"REG_a" \n\t"
2391 "1: \n\t"
2392 PREFETCH" 64(%1, %%"REG_a") \n\t"
2393 PREFETCH" 64(%2, %%"REG_a") \n\t"
2394 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2395 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2396 "movq %%mm0, %%mm1 \n\t"
2397 "movq %%mm2, %%mm3 \n\t"
2398 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2399 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2400 "punpcklbw %%mm4, %%mm0 \n\t"
2401 "punpckhbw %%mm4, %%mm1 \n\t"
2402 "punpcklbw %%mm5, %%mm2 \n\t"
2403 "punpckhbw %%mm5, %%mm3 \n\t"
2404 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
2405 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
2406 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
2407 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
2408 "add $16, %%"REG_a" \n\t"
2409 "cmp %3, %%"REG_a" \n\t"
2410 " jb 1b \n\t"
2411 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2412 : "memory", "%"REG_a
2414 #endif
2415 for (w= (width&(~15)); w < width; w++) {
2416 dest[2*w+0] = src1[w];
2417 dest[2*w+1] = src2[w];
2418 }
2419 #else
2420 for (w=0; w < width; w++) {
2421 dest[2*w+0] = src1[w];
2422 dest[2*w+1] = src2[w];
2423 }
2424 #endif
2425 dest += dstStride;
2426 src1 += src1Stride;
2427 src2 += src2Stride;
2428 }
2429 #if HAVE_MMX
2430 __asm__(
2431 EMMS" \n\t"
2432 SFENCE" \n\t"
2433 ::: "memory"
2435 #endif
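/* Upsamples two quarter-size chroma planes by 2x in each direction: every
 * source byte is doubled horizontally (punpck with itself) and every source
 * row feeds two destination rows (srcStride*(y>>1)). */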
2438 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2439 uint8_t *dst1, uint8_t *dst2,
2440 long width, long height,
2441 long srcStride1, long srcStride2,
2442 long dstStride1, long dstStride2)
2443 {
2444 x86_reg y;
2445 long x,w,h;
2446 w=width/2; h=height/2;
2447 #if HAVE_MMX
2448 __asm__ volatile(
2449 PREFETCH" %0 \n\t"
2450 PREFETCH" %1 \n\t"
2451 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2452 #endif
2453 for (y=0;y<h;y++) {
2454 const uint8_t* s1=src1+srcStride1*(y>>1);
2455 uint8_t* d=dst1+dstStride1*y;
2456 x=0;
2457 #if HAVE_MMX
2458 for (;x<w-31;x+=32) {
2459 __asm__ volatile(
2460 PREFETCH" 32%1 \n\t"
2461 "movq %1, %%mm0 \n\t"
2462 "movq 8%1, %%mm2 \n\t"
2463 "movq 16%1, %%mm4 \n\t"
2464 "movq 24%1, %%mm6 \n\t"
2465 "movq %%mm0, %%mm1 \n\t"
2466 "movq %%mm2, %%mm3 \n\t"
2467 "movq %%mm4, %%mm5 \n\t"
2468 "movq %%mm6, %%mm7 \n\t"
2469 "punpcklbw %%mm0, %%mm0 \n\t"
2470 "punpckhbw %%mm1, %%mm1 \n\t"
2471 "punpcklbw %%mm2, %%mm2 \n\t"
2472 "punpckhbw %%mm3, %%mm3 \n\t"
2473 "punpcklbw %%mm4, %%mm4 \n\t"
2474 "punpckhbw %%mm5, %%mm5 \n\t"
2475 "punpcklbw %%mm6, %%mm6 \n\t"
2476 "punpckhbw %%mm7, %%mm7 \n\t"
2477 MOVNTQ" %%mm0, %0 \n\t"
2478 MOVNTQ" %%mm1, 8%0 \n\t"
2479 MOVNTQ" %%mm2, 16%0 \n\t"
2480 MOVNTQ" %%mm3, 24%0 \n\t"
2481 MOVNTQ" %%mm4, 32%0 \n\t"
2482 MOVNTQ" %%mm5, 40%0 \n\t"
2483 MOVNTQ" %%mm6, 48%0 \n\t"
2484 MOVNTQ" %%mm7, 56%0"
2485 :"=m"(d[2*x])
2486 :"m"(s1[x])
2487 :"memory");
2489 #endif
2490 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2491 }
2492 for (y=0;y<h;y++) {
2493 const uint8_t* s2=src2+srcStride2*(y>>1);
2494 uint8_t* d=dst2+dstStride2*y;
2495 x=0;
2496 #if HAVE_MMX
2497 for (;x<w-31;x+=32) {
2498 __asm__ volatile(
2499 PREFETCH" 32%1 \n\t"
2500 "movq %1, %%mm0 \n\t"
2501 "movq 8%1, %%mm2 \n\t"
2502 "movq 16%1, %%mm4 \n\t"
2503 "movq 24%1, %%mm6 \n\t"
2504 "movq %%mm0, %%mm1 \n\t"
2505 "movq %%mm2, %%mm3 \n\t"
2506 "movq %%mm4, %%mm5 \n\t"
2507 "movq %%mm6, %%mm7 \n\t"
2508 "punpcklbw %%mm0, %%mm0 \n\t"
2509 "punpckhbw %%mm1, %%mm1 \n\t"
2510 "punpcklbw %%mm2, %%mm2 \n\t"
2511 "punpckhbw %%mm3, %%mm3 \n\t"
2512 "punpcklbw %%mm4, %%mm4 \n\t"
2513 "punpckhbw %%mm5, %%mm5 \n\t"
2514 "punpcklbw %%mm6, %%mm6 \n\t"
2515 "punpckhbw %%mm7, %%mm7 \n\t"
2516 MOVNTQ" %%mm0, %0 \n\t"
2517 MOVNTQ" %%mm1, 8%0 \n\t"
2518 MOVNTQ" %%mm2, 16%0 \n\t"
2519 MOVNTQ" %%mm3, 24%0 \n\t"
2520 MOVNTQ" %%mm4, 32%0 \n\t"
2521 MOVNTQ" %%mm5, 40%0 \n\t"
2522 MOVNTQ" %%mm6, 48%0 \n\t"
2523 MOVNTQ" %%mm7, 56%0"
2524 :"=m"(d[2*x])
2525 :"m"(s2[x])
2526 :"memory");
2528 #endif
2529 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2530 }
2531 #if HAVE_MMX
2532 __asm__(
2533 EMMS" \n\t"
2534 SFENCE" \n\t"
2535 ::: "memory"
2537 #endif
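/* YVU9 (one chroma sample per 4x4 luma block) -> packed YUY2: U and V are
 * repeated four times horizontally and reused for four lines (y>>2),
 * interleaved with luma as Y U Y V. */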
2540 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2541 uint8_t *dst,
2542 long width, long height,
2543 long srcStride1, long srcStride2,
2544 long srcStride3, long dstStride)
2545 {
2546 x86_reg x;
2547 long y,w,h;
2548 w=width/2; h=height;
2549 for (y=0;y<h;y++) {
2550 const uint8_t* yp=src1+srcStride1*y;
2551 const uint8_t* up=src2+srcStride2*(y>>2);
2552 const uint8_t* vp=src3+srcStride3*(y>>2);
2553 uint8_t* d=dst+dstStride*y;
2554 x=0;
2555 #if HAVE_MMX
2556 for (;x<w-7;x+=8) {
2557 __asm__ volatile(
2558 PREFETCH" 32(%1, %0) \n\t"
2559 PREFETCH" 32(%2, %0) \n\t"
2560 PREFETCH" 32(%3, %0) \n\t"
2561 "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2562 "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
2563 "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
2564 "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2565 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
2566 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
2567 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
2568 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
2569 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
2570 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
2572 "movq %%mm1, %%mm6 \n\t"
2573 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
2574 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2575 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2576 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
2577 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
2579 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
2580 "movq 8(%1, %0, 4), %%mm0 \n\t"
2581 "movq %%mm0, %%mm3 \n\t"
2582 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
2583 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
2584 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
2585 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
2587 "movq %%mm4, %%mm6 \n\t"
2588 "movq 16(%1, %0, 4), %%mm0 \n\t"
2589 "movq %%mm0, %%mm3 \n\t"
2590 "punpcklbw %%mm5, %%mm4 \n\t"
2591 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
2592 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
2593 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
2594 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
2596 "punpckhbw %%mm5, %%mm6 \n\t"
2597 "movq 24(%1, %0, 4), %%mm0 \n\t"
2598 "movq %%mm0, %%mm3 \n\t"
2599 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
2600 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
2601 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
2602 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
2604 : "+r" (x)
2605 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2606 :"memory");
2608 #endif
2609 for (; x<w; x++) {
2610 const long x2 = x<<2;
2611 d[8*x+0] = yp[x2];
2612 d[8*x+1] = up[x];
2613 d[8*x+2] = yp[x2+1];
2614 d[8*x+3] = vp[x];
2615 d[8*x+4] = yp[x2+2];
2616 d[8*x+5] = up[x];
2617 d[8*x+6] = yp[x2+3];
2618 d[8*x+7] = vp[x];
2619 }
2620 }
2621 #if HAVE_MMX
2622 __asm__(
2623 EMMS" \n\t"
2624 SFENCE" \n\t"
2625 ::: "memory"
2627 #endif
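/* extract_even: dst[i] = src[2*i] for 0 <= i < count, i.e. the even bytes
 * (e.g. luma from YUYV). The MMX loop runs an index from -count up to 0,
 * masking and packing 16 output bytes per iteration; the scalar loop
 * finishes the tail. */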
2630 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
2631 {
2632 dst += count;
2633 src += 2*count;
2634 count= - count;
2636 #if HAVE_MMX
2637 if(count <= -16) {
2638 count += 15;
2639 __asm__ volatile(
2640 "pcmpeqw %%mm7, %%mm7 \n\t"
2641 "psrlw $8, %%mm7 \n\t"
2642 "1: \n\t"
2643 "movq -30(%1, %0, 2), %%mm0 \n\t"
2644 "movq -22(%1, %0, 2), %%mm1 \n\t"
2645 "movq -14(%1, %0, 2), %%mm2 \n\t"
2646 "movq -6(%1, %0, 2), %%mm3 \n\t"
2647 "pand %%mm7, %%mm0 \n\t"
2648 "pand %%mm7, %%mm1 \n\t"
2649 "pand %%mm7, %%mm2 \n\t"
2650 "pand %%mm7, %%mm3 \n\t"
2651 "packuswb %%mm1, %%mm0 \n\t"
2652 "packuswb %%mm3, %%mm2 \n\t"
2653 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2654 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2655 "add $16, %0 \n\t"
2656 " js 1b \n\t"
2657 : "+r"(count)
2658 : "r"(src), "r"(dst)
2660 count -= 15;
2662 #endif
2663 while(count<0) {
2664 dst[count]= src[2*count];
2665 count++;
2666 }
2667 }
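/* extract_even2: dst0[i] = src[4*i], dst1[i] = src[4*i+2] -- splits the two
 * even-offset channels of each 4-byte group (e.g. U and V from UYVY). */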
2669 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2670 {
2671 dst0+= count;
2672 dst1+= count;
2673 src += 4*count;
2674 count= - count;
2675 #if HAVE_MMX
2676 if(count <= -8) {
2677 count += 7;
2678 __asm__ volatile(
2679 "pcmpeqw %%mm7, %%mm7 \n\t"
2680 "psrlw $8, %%mm7 \n\t"
2681 "1: \n\t"
2682 "movq -28(%1, %0, 4), %%mm0 \n\t"
2683 "movq -20(%1, %0, 4), %%mm1 \n\t"
2684 "movq -12(%1, %0, 4), %%mm2 \n\t"
2685 "movq -4(%1, %0, 4), %%mm3 \n\t"
2686 "pand %%mm7, %%mm0 \n\t"
2687 "pand %%mm7, %%mm1 \n\t"
2688 "pand %%mm7, %%mm2 \n\t"
2689 "pand %%mm7, %%mm3 \n\t"
2690 "packuswb %%mm1, %%mm0 \n\t"
2691 "packuswb %%mm3, %%mm2 \n\t"
2692 "movq %%mm0, %%mm1 \n\t"
2693 "movq %%mm2, %%mm3 \n\t"
2694 "psrlw $8, %%mm0 \n\t"
2695 "psrlw $8, %%mm2 \n\t"
2696 "pand %%mm7, %%mm1 \n\t"
2697 "pand %%mm7, %%mm3 \n\t"
2698 "packuswb %%mm2, %%mm0 \n\t"
2699 "packuswb %%mm3, %%mm1 \n\t"
2700 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2701 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2702 "add $8, %0 \n\t"
2703 " js 1b \n\t"
2704 : "+r"(count)
2705 : "r"(src), "r"(dst0), "r"(dst1)
2707 count -= 7;
2709 #endif
2710 while(count<0) {
2711 dst0[count]= src[4*count+0];
2712 dst1[count]= src[4*count+2];
2713 count++;
2714 }
2715 }
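/* extract_even2avg: like extract_even2, but averages rows src0 and src1
 * first. Note that PAVGB rounds ((a+b+1)>>1) while the scalar fallback
 * truncates ((a+b)>>1). */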
2717 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2718 {
2719 dst0 += count;
2720 dst1 += count;
2721 src0 += 4*count;
2722 src1 += 4*count;
2723 count= - count;
2724 #ifdef PAVGB
2725 if(count <= -8) {
2726 count += 7;
2727 __asm__ volatile(
2728 "pcmpeqw %%mm7, %%mm7 \n\t"
2729 "psrlw $8, %%mm7 \n\t"
2730 "1: \n\t"
2731 "movq -28(%1, %0, 4), %%mm0 \n\t"
2732 "movq -20(%1, %0, 4), %%mm1 \n\t"
2733 "movq -12(%1, %0, 4), %%mm2 \n\t"
2734 "movq -4(%1, %0, 4), %%mm3 \n\t"
2735 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2736 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2737 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2738 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2739 "pand %%mm7, %%mm0 \n\t"
2740 "pand %%mm7, %%mm1 \n\t"
2741 "pand %%mm7, %%mm2 \n\t"
2742 "pand %%mm7, %%mm3 \n\t"
2743 "packuswb %%mm1, %%mm0 \n\t"
2744 "packuswb %%mm3, %%mm2 \n\t"
2745 "movq %%mm0, %%mm1 \n\t"
2746 "movq %%mm2, %%mm3 \n\t"
2747 "psrlw $8, %%mm0 \n\t"
2748 "psrlw $8, %%mm2 \n\t"
2749 "pand %%mm7, %%mm1 \n\t"
2750 "pand %%mm7, %%mm3 \n\t"
2751 "packuswb %%mm2, %%mm0 \n\t"
2752 "packuswb %%mm3, %%mm1 \n\t"
2753 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2754 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2755 "add $8, %0 \n\t"
2756 " js 1b \n\t"
2757 : "+r"(count)
2758 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2760 count -= 7;
2762 #endif
2763 while(count<0) {
2764 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2765 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2766 count++;
2767 }
2768 }
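/* extract_odd2: dst0[i] = src[4*i+1], dst1[i] = src[4*i+3] -- the odd-offset
 * channels; the src++ before the scalar loop keeps its indices in step with
 * the MMX path. */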
2770 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2771 {
2772 dst0+= count;
2773 dst1+= count;
2774 src += 4*count;
2775 count= - count;
2776 #if HAVE_MMX
2777 if(count <= -8) {
2778 count += 7;
2779 __asm__ volatile(
2780 "pcmpeqw %%mm7, %%mm7 \n\t"
2781 "psrlw $8, %%mm7 \n\t"
2782 "1: \n\t"
2783 "movq -28(%1, %0, 4), %%mm0 \n\t"
2784 "movq -20(%1, %0, 4), %%mm1 \n\t"
2785 "movq -12(%1, %0, 4), %%mm2 \n\t"
2786 "movq -4(%1, %0, 4), %%mm3 \n\t"
2787 "psrlw $8, %%mm0 \n\t"
2788 "psrlw $8, %%mm1 \n\t"
2789 "psrlw $8, %%mm2 \n\t"
2790 "psrlw $8, %%mm3 \n\t"
2791 "packuswb %%mm1, %%mm0 \n\t"
2792 "packuswb %%mm3, %%mm2 \n\t"
2793 "movq %%mm0, %%mm1 \n\t"
2794 "movq %%mm2, %%mm3 \n\t"
2795 "psrlw $8, %%mm0 \n\t"
2796 "psrlw $8, %%mm2 \n\t"
2797 "pand %%mm7, %%mm1 \n\t"
2798 "pand %%mm7, %%mm3 \n\t"
2799 "packuswb %%mm2, %%mm0 \n\t"
2800 "packuswb %%mm3, %%mm1 \n\t"
2801 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2802 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2803 "add $8, %0 \n\t"
2804 " js 1b \n\t"
2805 : "+r"(count)
2806 : "r"(src), "r"(dst0), "r"(dst1)
2808 count -= 7;
2810 #endif
2811 src++;
2812 while(count<0) {
2813 dst0[count]= src[4*count+0];
2814 dst1[count]= src[4*count+2];
2815 count++;
2816 }
2817 }
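/* extract_odd2avg: odd-offset variant of extract_even2avg, with the same
 * PAVGB-rounds / C-truncates difference between the two paths. */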
2819 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2820 {
2821 dst0 += count;
2822 dst1 += count;
2823 src0 += 4*count;
2824 src1 += 4*count;
2825 count= - count;
2826 #ifdef PAVGB
2827 if(count <= -8) {
2828 count += 7;
2829 __asm__ volatile(
2830 "pcmpeqw %%mm7, %%mm7 \n\t"
2831 "psrlw $8, %%mm7 \n\t"
2832 "1: \n\t"
2833 "movq -28(%1, %0, 4), %%mm0 \n\t"
2834 "movq -20(%1, %0, 4), %%mm1 \n\t"
2835 "movq -12(%1, %0, 4), %%mm2 \n\t"
2836 "movq -4(%1, %0, 4), %%mm3 \n\t"
2837 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2838 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2839 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2840 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2841 "psrlw $8, %%mm0 \n\t"
2842 "psrlw $8, %%mm1 \n\t"
2843 "psrlw $8, %%mm2 \n\t"
2844 "psrlw $8, %%mm3 \n\t"
2845 "packuswb %%mm1, %%mm0 \n\t"
2846 "packuswb %%mm3, %%mm2 \n\t"
2847 "movq %%mm0, %%mm1 \n\t"
2848 "movq %%mm2, %%mm3 \n\t"
2849 "psrlw $8, %%mm0 \n\t"
2850 "psrlw $8, %%mm2 \n\t"
2851 "pand %%mm7, %%mm1 \n\t"
2852 "pand %%mm7, %%mm3 \n\t"
2853 "packuswb %%mm2, %%mm0 \n\t"
2854 "packuswb %%mm3, %%mm1 \n\t"
2855 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2856 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2857 "add $8, %0 \n\t"
2858 " js 1b \n\t"
2859 : "+r"(count)
2860 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2862 count -= 7;
2864 #endif
2865 src0++;
2866 src1++;
2867 while(count<0) {
2868 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2869 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2870 count++;
2871 }
2872 }
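/* YUYV (YUY2) -> planar YV12: the even bytes of every line are luma; the
 * odd bytes are chroma, vertically averaged over each line pair on the odd
 * rows. */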
2874 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2875 long width, long height,
2876 long lumStride, long chromStride, long srcStride)
2877 {
2878 long y;
2879 const long chromWidth= -((-width)>>1);
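// -((-width)>>1) is ceil(width/2): an odd trailing luma pixel still needs
// one chroma sample.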
2881 for (y=0; y<height; y++) {
2882 RENAME(extract_even)(src, ydst, width);
2883 if(y&1) {
2884 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
2885 udst+= chromStride;
2886 vdst+= chromStride;
2887 }
2889 src += srcStride;
2890 ydst+= lumStride;
2891 }
2892 #if HAVE_MMX
2893 __asm__(
2894 EMMS" \n\t"
2895 SFENCE" \n\t"
2896 ::: "memory"
2898 #endif
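/* UYVY -> YV12: src+1 puts the Y bytes on even offsets for extract_even;
 * chroma already sits on the even offsets of the unshifted line, hence
 * extract_even2avg below. */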
2901 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2902 long width, long height,
2903 long lumStride, long chromStride, long srcStride)
2904 {
2905 long y;
2906 const long chromWidth= -((-width)>>1);
2908 for (y=0; y<height; y++) {
2909 RENAME(extract_even)(src, ydst, width);
2910 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
2912 src += srcStride;
2913 ydst+= lumStride;
2914 udst+= chromStride;
2915 vdst+= chromStride;
2916 }
2917 #if HAVE_MMX
2918 __asm__(
2919 EMMS" \n\t"
2920 SFENCE" \n\t"
2921 ::: "memory"
2923 #endif
2926 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2927 long width, long height,
2928 long lumStride, long chromStride, long srcStride)
2929 {
2930 long y;
2931 const long chromWidth= -((-width)>>1);
2933 for (y=0; y<height; y++) {
2934 RENAME(extract_even)(src+1, ydst, width);
2935 if(y&1) {
2936 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
2937 udst+= chromStride;
2938 vdst+= chromStride;
2939 }
2941 src += srcStride;
2942 ydst+= lumStride;
2943 }
2944 #if HAVE_MMX
2945 __asm__(
2946 EMMS" \n\t"
2947 SFENCE" \n\t"
2948 ::: "memory"
2950 #endif
2953 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2954 long width, long height,
2955 long lumStride, long chromStride, long srcStride)
2956 {
2957 long y;
2958 const long chromWidth= -((-width)>>1);
2960 for (y=0; y<height; y++) {
2961 RENAME(extract_even)(src+1, ydst, width);
2962 RENAME(extract_even2)(src, udst, vdst, chromWidth);
2964 src += srcStride;
2965 ydst+= lumStride;
2966 udst+= chromStride;
2967 vdst+= chromStride;
2968 }
2969 #if HAVE_MMX
2970 __asm__(
2971 EMMS" \n\t"
2972 SFENCE" \n\t"
2973 ::: "memory"
2975 #endif
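/* Registers this template's versions in the global function pointers from
 * rgb2rgb.h. RENAME() is presumed to append the CPU-specific suffix
 * (_C, _MMX, _MMX2, ...) that rgb2rgb.c defines before each inclusion of
 * this file. */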
2978 static inline void RENAME(rgb2rgb_init)(void)
2979 {
2980 rgb15to16 = RENAME(rgb15to16);
2981 rgb15tobgr24 = RENAME(rgb15tobgr24);
2982 rgb15to32 = RENAME(rgb15to32);
2983 rgb16tobgr24 = RENAME(rgb16tobgr24);
2984 rgb16to32 = RENAME(rgb16to32);
2985 rgb16to15 = RENAME(rgb16to15);
2986 rgb24tobgr16 = RENAME(rgb24tobgr16);
2987 rgb24tobgr15 = RENAME(rgb24tobgr15);
2988 rgb24tobgr32 = RENAME(rgb24tobgr32);
2989 rgb32to16 = RENAME(rgb32to16);
2990 rgb32to15 = RENAME(rgb32to15);
2991 rgb32tobgr24 = RENAME(rgb32tobgr24);
2992 rgb24to15 = RENAME(rgb24to15);
2993 rgb24to16 = RENAME(rgb24to16);
2994 rgb24tobgr24 = RENAME(rgb24tobgr24);
2995 rgb32tobgr32 = RENAME(rgb32tobgr32);
2996 rgb32tobgr16 = RENAME(rgb32tobgr16);
2997 rgb32tobgr15 = RENAME(rgb32tobgr15);
2998 yv12toyuy2 = RENAME(yv12toyuy2);
2999 yv12touyvy = RENAME(yv12touyvy);
3000 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
3001 yuv422ptouyvy = RENAME(yuv422ptouyvy);
3002 yuy2toyv12 = RENAME(yuy2toyv12);
3003 // yvu9toyv12 = RENAME(yvu9toyv12);
3004 planar2x = RENAME(planar2x);
3005 rgb24toyv12 = RENAME(rgb24toyv12);
3006 interleaveBytes = RENAME(interleaveBytes);
3007 vu9_to_vu12 = RENAME(vu9_to_vu12);
3008 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
3010 uyvytoyuv420 = RENAME(uyvytoyuv420);
3011 uyvytoyuv422 = RENAME(uyvytoyuv422);
3012 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
3013 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
3014 }