Typos
[vlc/asuraparaju-public.git] / modules / video_chroma / i420_yuy2.c
blob57a7af6de059360a458a136885b2d805da2ebf01
1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
5 * $Id$
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
26 * Preamble
27 *****************************************************************************/
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
38 # include <altivec.h>
39 #endif
41 #include "i420_yuy2.h"
43 #define SRC_FOURCC "I420,IYUV,YV12"
45 #if defined (MODULE_NAME_IS_i420_yuy2)
46 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
48 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
50 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
52 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
53 #endif
55 /*****************************************************************************
56 * Local and extern prototypes.
57 *****************************************************************************/
58 static int Activate ( vlc_object_t * );
60 static void I420_YUY2 ( filter_t *, picture_t *, picture_t * );
61 static void I420_YVYU ( filter_t *, picture_t *, picture_t * );
62 static void I420_UYVY ( filter_t *, picture_t *, picture_t * );
63 static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * );
64 static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * );
65 static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * );
66 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
67 static void I420_IUYV ( filter_t *, picture_t *, picture_t * );
68 static void I420_cyuv ( filter_t *, picture_t *, picture_t * );
69 static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * );
70 static picture_t *I420_cyuv_Filter ( filter_t *, picture_t * );
71 #endif
72 #if defined (MODULE_NAME_IS_i420_yuy2)
73 static void I420_Y211 ( filter_t *, picture_t *, picture_t * );
74 static picture_t *I420_Y211_Filter ( filter_t *, picture_t * );
75 #endif
77 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
/* Constant operands for the MMX code path.
 * i_00ffw: 0x00ff in each of the four 16-bit lanes.
 * i_80w:   0x80 in each of the four low bytes, upper 32 bits clear. */
static const uint64_t i_00ffw = UINT64_C(0x00ff00ff00ff00ff);
static const uint64_t i_80w   = UINT64_C(0x0000000080808080);
81 #endif
83 /*****************************************************************************
84 * Module descriptor.
85 *****************************************************************************/
86 vlc_module_begin ()
87 #if defined (MODULE_NAME_IS_i420_yuy2)
88 set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
89 set_capability( "video filter2", 80 )
90 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
91 set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
92 set_capability( "video filter2", 160 )
93 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
94 set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
95 set_capability( "video filter2", 250 )
96 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
97 set_description(
98 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
99 set_capability( "video filter2", 250 )
100 #endif
101 set_callbacks( Activate, NULL )
102 vlc_module_end ()
104 /*****************************************************************************
105 * Activate: allocate a chroma function
106 *****************************************************************************
107 * This function allocates and initializes a chroma function
108 *****************************************************************************/
109 static int Activate( vlc_object_t *p_this )
111 filter_t *p_filter = (filter_t *)p_this;
113 if( p_filter->fmt_in.video.i_width & 1
114 || p_filter->fmt_in.video.i_height & 1 )
116 return -1;
119 if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
120 || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
121 return -1;
123 switch( p_filter->fmt_in.video.i_chroma )
125 case VLC_CODEC_YV12:
126 case VLC_CODEC_I420:
127 switch( p_filter->fmt_out.video.i_chroma )
129 case VLC_CODEC_YUYV:
130 p_filter->pf_video_filter = I420_YUY2_Filter;
131 break;
133 case VLC_CODEC_YVYU:
134 p_filter->pf_video_filter = I420_YVYU_Filter;
135 break;
137 case VLC_CODEC_UYVY:
138 p_filter->pf_video_filter = I420_UYVY_Filter;
139 break;
140 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
141 case VLC_FOURCC('I','U','Y','V'):
142 p_filter->pf_video_filter = I420_IUYV_Filter;
143 break;
145 case VLC_CODEC_CYUV:
146 p_filter->pf_video_filter = I420_cyuv_Filter;
147 break;
148 #endif
150 #if defined (MODULE_NAME_IS_i420_yuy2)
151 case VLC_CODEC_Y211:
152 p_filter->pf_video_filter = I420_Y211_Filter;
153 break;
154 #endif
156 default:
157 return -1;
159 break;
161 default:
162 return -1;
165 return 0;
168 #if 0
/* Debug helper (compiled out): returns the x86 time-stamp counter.
 *
 * RDTSC leaves the low 32 bits in EAX and the high 32 bits in EDX.  The
 * halves are read through separate "=a"/"=d" outputs and combined by hand:
 * the "=A" constraint only denotes the EDX:EAX pair on 32-bit x86, and on
 * x86-64 it would yield a single register and lose half of the counter. */
static inline unsigned long long read_cycles(void)
{
    unsigned int lo, hi;

    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));

    return ( (unsigned long long)hi << 32 ) | lo;
}
176 #endif
178 /* Following functions are local */
180 VIDEO_FILTER_WRAPPER( I420_YUY2 )
181 VIDEO_FILTER_WRAPPER( I420_YVYU )
182 VIDEO_FILTER_WRAPPER( I420_UYVY )
183 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
184 VIDEO_FILTER_WRAPPER( I420_IUYV )
185 VIDEO_FILTER_WRAPPER( I420_cyuv )
186 #endif
187 #if defined (MODULE_NAME_IS_i420_yuy2)
188 VIDEO_FILTER_WRAPPER( I420_Y211 )
189 #endif
191 /*****************************************************************************
192 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
193 *****************************************************************************/
194 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
195 picture_t *p_dest )
197 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
198 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
199 uint8_t *p_u = p_source->U_PIXELS;
200 uint8_t *p_v = p_source->V_PIXELS;
202 int i_x, i_y;
204 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
205 #define VEC_NEXT_LINES( ) \
206 p_line1 = p_line2; \
207 p_line2 += p_dest->p->i_pitch; \
208 p_y1 = p_y2; \
209 p_y2 += p_source->p[Y_PLANE].i_pitch;
211 #define VEC_LOAD_UV( ) \
212 u_vec = vec_ld( 0, p_u ); p_u += 16; \
213 v_vec = vec_ld( 0, p_v ); p_v += 16;
215 #define VEC_MERGE( a ) \
216 uv_vec = a( u_vec, v_vec ); \
217 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
218 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
219 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
220 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
221 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
222 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
224 vector unsigned char u_vec;
225 vector unsigned char v_vec;
226 vector unsigned char uv_vec;
227 vector unsigned char y_vec;
229 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
230 ( p_filter->fmt_in.video.i_height % 2 ) ) )
232 /* Width is a multiple of 32, we take 2 lines at a time */
233 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
235 VEC_NEXT_LINES( );
236 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
238 VEC_LOAD_UV( );
239 VEC_MERGE( vec_mergeh );
240 VEC_MERGE( vec_mergel );
244 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
245 ( p_filter->fmt_in.video.i_height % 4 ) ) )
247 /* Width is only a multiple of 16, we take 4 lines at a time */
248 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
250 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
251 VEC_NEXT_LINES( );
252 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
254 VEC_LOAD_UV( );
255 VEC_MERGE( vec_mergeh );
256 VEC_MERGE( vec_mergel );
259 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
260 VEC_LOAD_UV( );
261 VEC_MERGE( vec_mergeh );
263 /* Line 3 and 4, pixels 0 to 16 */
264 VEC_NEXT_LINES( );
265 VEC_MERGE( vec_mergel );
267 /* Line 3 and 4, pixels 16 to ( width ) */
268 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
270 VEC_LOAD_UV( );
271 VEC_MERGE( vec_mergeh );
272 VEC_MERGE( vec_mergel );
276 else
278 /* Crap, use the C version */
279 #undef VEC_NEXT_LINES
280 #undef VEC_LOAD_UV
281 #undef VEC_MERGE
282 #endif
284 const int i_source_margin = p_source->p[0].i_pitch
285 - p_source->p[0].i_visible_pitch;
286 const int i_source_margin_c = p_source->p[1].i_pitch
287 - p_source->p[1].i_visible_pitch;
288 const int i_dest_margin = p_dest->p->i_pitch
289 - p_dest->p->i_visible_pitch;
291 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
292 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
294 p_line1 = p_line2;
295 p_line2 += p_dest->p->i_pitch;
297 p_y1 = p_y2;
298 p_y2 += p_source->p[Y_PLANE].i_pitch;
300 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
301 for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
303 C_YUV420_YUYV( );
304 C_YUV420_YUYV( );
305 C_YUV420_YUYV( );
306 C_YUV420_YUYV( );
308 #else
309 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
311 MMX_CALL( MMX_YUV420_YUYV );
313 #endif
314 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
316 C_YUV420_YUYV( );
319 p_y1 += i_source_margin;
320 p_y2 += i_source_margin;
321 p_u += i_source_margin_c;
322 p_v += i_source_margin_c;
323 p_line1 += i_dest_margin;
324 p_line2 += i_dest_margin;
327 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
328 /* re-enable FPU registers */
329 MMX_END;
330 #endif
332 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
334 #endif
336 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
338 ** SSE2 128 bits fetch/store instructions are faster
339 ** if memory access is 16 bytes aligned
342 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
343 ((intptr_t)p_line2|(intptr_t)p_y2))) )
345 /* use faster SSE2 aligned fetch and store */
346 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
348 p_line1 = p_line2;
349 p_line2 += p_dest->p->i_pitch;
351 p_y1 = p_y2;
352 p_y2 += p_source->p[Y_PLANE].i_pitch;
354 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
356 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
358 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
360 C_YUV420_YUYV( );
363 p_y1 += i_source_margin;
364 p_y2 += i_source_margin;
365 p_u += i_source_margin_c;
366 p_v += i_source_margin_c;
367 p_line1 += i_dest_margin;
368 p_line2 += i_dest_margin;
371 else
373 /* use slower SSE2 unaligned fetch and store */
374 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
376 p_line1 = p_line2;
377 p_line2 += p_dest->p->i_pitch;
379 p_y1 = p_y2;
380 p_y2 += p_source->p[Y_PLANE].i_pitch;
382 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
384 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
386 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
388 C_YUV420_YUYV( );
391 p_y1 += i_source_margin;
392 p_y2 += i_source_margin;
393 p_u += i_source_margin_c;
394 p_v += i_source_margin_c;
395 p_line1 += i_dest_margin;
396 p_line2 += i_dest_margin;
399 /* make sure all SSE2 stores are visible thereafter */
400 SSE2_END;
402 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
405 /*****************************************************************************
406 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
407 *****************************************************************************/
408 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
409 picture_t *p_dest )
411 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
412 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
413 uint8_t *p_u = p_source->U_PIXELS;
414 uint8_t *p_v = p_source->V_PIXELS;
416 int i_x, i_y;
418 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
419 #define VEC_NEXT_LINES( ) \
420 p_line1 = p_line2; \
421 p_line2 += p_dest->p->i_pitch; \
422 p_y1 = p_y2; \
423 p_y2 += p_source->p[Y_PLANE].i_pitch;
425 #define VEC_LOAD_UV( ) \
426 u_vec = vec_ld( 0, p_u ); p_u += 16; \
427 v_vec = vec_ld( 0, p_v ); p_v += 16;
429 #define VEC_MERGE( a ) \
430 vu_vec = a( v_vec, u_vec ); \
431 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
432 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
433 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
434 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
435 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
436 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
438 vector unsigned char u_vec;
439 vector unsigned char v_vec;
440 vector unsigned char vu_vec;
441 vector unsigned char y_vec;
443 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
444 ( p_filter->fmt_in.video.i_height % 2 ) ) )
446 /* Width is a multiple of 32, we take 2 lines at a time */
447 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
449 VEC_NEXT_LINES( );
450 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
452 VEC_LOAD_UV( );
453 VEC_MERGE( vec_mergeh );
454 VEC_MERGE( vec_mergel );
458 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
459 ( p_filter->fmt_in.video.i_height % 4 ) ) )
461 /* Width is only a multiple of 16, we take 4 lines at a time */
462 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
464 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
465 VEC_NEXT_LINES( );
466 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
468 VEC_LOAD_UV( );
469 VEC_MERGE( vec_mergeh );
470 VEC_MERGE( vec_mergel );
473 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
474 VEC_LOAD_UV( );
475 VEC_MERGE( vec_mergeh );
477 /* Line 3 and 4, pixels 0 to 16 */
478 VEC_NEXT_LINES( );
479 VEC_MERGE( vec_mergel );
481 /* Line 3 and 4, pixels 16 to ( width ) */
482 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
484 VEC_LOAD_UV( );
485 VEC_MERGE( vec_mergeh );
486 VEC_MERGE( vec_mergel );
490 else
492 /* Crap, use the C version */
493 #undef VEC_NEXT_LINES
494 #undef VEC_LOAD_UV
495 #undef VEC_MERGE
496 #endif
498 const int i_source_margin = p_source->p[0].i_pitch
499 - p_source->p[0].i_visible_pitch;
500 const int i_source_margin_c = p_source->p[1].i_pitch
501 - p_source->p[1].i_visible_pitch;
502 const int i_dest_margin = p_dest->p->i_pitch
503 - p_dest->p->i_visible_pitch;
505 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
506 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
508 p_line1 = p_line2;
509 p_line2 += p_dest->p->i_pitch;
511 p_y1 = p_y2;
512 p_y2 += p_source->p[Y_PLANE].i_pitch;
514 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
516 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
517 C_YUV420_YVYU( );
518 C_YUV420_YVYU( );
519 C_YUV420_YVYU( );
520 C_YUV420_YVYU( );
521 #else
522 MMX_CALL( MMX_YUV420_YVYU );
523 #endif
525 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
527 C_YUV420_YVYU( );
530 p_y1 += i_source_margin;
531 p_y2 += i_source_margin;
532 p_u += i_source_margin_c;
533 p_v += i_source_margin_c;
534 p_line1 += i_dest_margin;
535 p_line2 += i_dest_margin;
538 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
539 /* re-enable FPU registers */
540 MMX_END;
541 #endif
543 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
545 #endif
547 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
549 ** SSE2 128 bits fetch/store instructions are faster
550 ** if memory access is 16 bytes aligned
552 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
553 ((intptr_t)p_line2|(intptr_t)p_y2))) )
555 /* use faster SSE2 aligned fetch and store */
556 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
558 p_line1 = p_line2;
559 p_line2 += p_dest->p->i_pitch;
561 p_y1 = p_y2;
562 p_y2 += p_source->p[Y_PLANE].i_pitch;
564 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
566 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
568 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
570 C_YUV420_YVYU( );
573 p_y1 += i_source_margin;
574 p_y2 += i_source_margin;
575 p_u += i_source_margin_c;
576 p_v += i_source_margin_c;
577 p_line1 += i_dest_margin;
578 p_line2 += i_dest_margin;
581 else
583 /* use slower SSE2 unaligned fetch and store */
584 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
586 p_line1 = p_line2;
587 p_line2 += p_dest->p->i_pitch;
589 p_y1 = p_y2;
590 p_y2 += p_source->p[Y_PLANE].i_pitch;
592 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
594 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
596 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
598 C_YUV420_YVYU( );
601 p_y1 += i_source_margin;
602 p_y2 += i_source_margin;
603 p_u += i_source_margin_c;
604 p_v += i_source_margin_c;
605 p_line1 += i_dest_margin;
606 p_line2 += i_dest_margin;
609 /* make sure all SSE2 stores are visible thereafter */
610 SSE2_END;
611 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
614 /*****************************************************************************
615 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
616 *****************************************************************************/
617 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
618 picture_t *p_dest )
620 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
621 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
622 uint8_t *p_u = p_source->U_PIXELS;
623 uint8_t *p_v = p_source->V_PIXELS;
625 int i_x, i_y;
627 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
628 #define VEC_NEXT_LINES( ) \
629 p_line1 = p_line2; \
630 p_line2 += p_dest->p->i_pitch; \
631 p_y1 = p_y2; \
632 p_y2 += p_source->p[Y_PLANE].i_pitch;
634 #define VEC_LOAD_UV( ) \
635 u_vec = vec_ld( 0, p_u ); p_u += 16; \
636 v_vec = vec_ld( 0, p_v ); p_v += 16;
638 #define VEC_MERGE( a ) \
639 uv_vec = a( u_vec, v_vec ); \
640 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
641 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
642 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
643 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
644 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
645 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
647 vector unsigned char u_vec;
648 vector unsigned char v_vec;
649 vector unsigned char uv_vec;
650 vector unsigned char y_vec;
652 if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
653 ( p_filter->fmt_in.video.i_height % 2 ) ) )
655 /* Width is a multiple of 32, we take 2 lines at a time */
656 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
658 VEC_NEXT_LINES( );
659 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
661 VEC_LOAD_UV( );
662 VEC_MERGE( vec_mergeh );
663 VEC_MERGE( vec_mergel );
667 else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
668 ( p_filter->fmt_in.video.i_height % 4 ) ) )
670 /* Width is only a multiple of 16, we take 4 lines at a time */
671 for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
673 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
674 VEC_NEXT_LINES( );
675 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
677 VEC_LOAD_UV( );
678 VEC_MERGE( vec_mergeh );
679 VEC_MERGE( vec_mergel );
682 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
683 VEC_LOAD_UV( );
684 VEC_MERGE( vec_mergeh );
686 /* Line 3 and 4, pixels 0 to 16 */
687 VEC_NEXT_LINES( );
688 VEC_MERGE( vec_mergel );
690 /* Line 3 and 4, pixels 16 to ( width ) */
691 for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
693 VEC_LOAD_UV( );
694 VEC_MERGE( vec_mergeh );
695 VEC_MERGE( vec_mergel );
699 else
701 /* Crap, use the C version */
702 #undef VEC_NEXT_LINES
703 #undef VEC_LOAD_UV
704 #undef VEC_MERGE
705 #endif
707 const int i_source_margin = p_source->p[0].i_pitch
708 - p_source->p[0].i_visible_pitch;
709 const int i_source_margin_c = p_source->p[1].i_pitch
710 - p_source->p[1].i_visible_pitch;
711 const int i_dest_margin = p_dest->p->i_pitch
712 - p_dest->p->i_visible_pitch;
714 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
715 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
717 p_line1 = p_line2;
718 p_line2 += p_dest->p->i_pitch;
720 p_y1 = p_y2;
721 p_y2 += p_source->p[Y_PLANE].i_pitch;
723 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
725 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
726 C_YUV420_UYVY( );
727 C_YUV420_UYVY( );
728 C_YUV420_UYVY( );
729 C_YUV420_UYVY( );
730 #else
731 MMX_CALL( MMX_YUV420_UYVY );
732 #endif
734 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
736 C_YUV420_UYVY( );
739 p_y1 += i_source_margin;
740 p_y2 += i_source_margin;
741 p_u += i_source_margin_c;
742 p_v += i_source_margin_c;
743 p_line1 += i_dest_margin;
744 p_line2 += i_dest_margin;
747 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
748 /* re-enable FPU registers */
749 MMX_END;
750 #endif
752 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
754 #endif
756 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
758 ** SSE2 128 bits fetch/store instructions are faster
759 ** if memory access is 16 bytes aligned
761 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
762 ((intptr_t)p_line2|(intptr_t)p_y2))) )
764 /* use faster SSE2 aligned fetch and store */
765 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
767 p_line1 = p_line2;
768 p_line2 += p_dest->p->i_pitch;
770 p_y1 = p_y2;
771 p_y2 += p_source->p[Y_PLANE].i_pitch;
773 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
775 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
777 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
779 C_YUV420_UYVY( );
782 p_y1 += i_source_margin;
783 p_y2 += i_source_margin;
784 p_u += i_source_margin_c;
785 p_v += i_source_margin_c;
786 p_line1 += i_dest_margin;
787 p_line2 += i_dest_margin;
790 else
792 /* use slower SSE2 unaligned fetch and store */
793 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
795 p_line1 = p_line2;
796 p_line2 += p_dest->p->i_pitch;
798 p_y1 = p_y2;
799 p_y2 += p_source->p[Y_PLANE].i_pitch;
801 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
803 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
805 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
807 C_YUV420_UYVY( );
810 p_y1 += i_source_margin;
811 p_y2 += i_source_margin;
812 p_u += i_source_margin_c;
813 p_v += i_source_margin_c;
814 p_line1 += i_dest_margin;
815 p_line2 += i_dest_margin;
818 /* make sure all SSE2 stores are visible thereafter */
819 SSE2_END;
820 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
823 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
824 /*****************************************************************************
825 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
826 *****************************************************************************/
827 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
828 picture_t *p_dest )
830 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
831 /* FIXME: TODO ! */
832 msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
835 /*****************************************************************************
836 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
837 *****************************************************************************/
838 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
839 picture_t *p_dest )
841 uint8_t *p_line1 = p_dest->p->p_pixels +
842 p_dest->p->i_visible_lines * p_dest->p->i_pitch
843 + p_dest->p->i_pitch;
844 uint8_t *p_line2 = p_dest->p->p_pixels +
845 p_dest->p->i_visible_lines * p_dest->p->i_pitch;
846 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
847 uint8_t *p_u = p_source->U_PIXELS;
848 uint8_t *p_v = p_source->V_PIXELS;
850 int i_x, i_y;
852 const int i_source_margin = p_source->p[0].i_pitch
853 - p_source->p[0].i_visible_pitch;
854 const int i_source_margin_c = p_source->p[1].i_pitch
855 - p_source->p[1].i_visible_pitch;
856 const int i_dest_margin = p_dest->p->i_pitch
857 - p_dest->p->i_visible_pitch;
859 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
860 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
862 p_line1 -= 3 * p_dest->p->i_pitch;
863 p_line2 -= 3 * p_dest->p->i_pitch;
865 p_y1 = p_y2;
866 p_y2 += p_source->p[Y_PLANE].i_pitch;
868 for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
870 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
871 C_YUV420_UYVY( );
872 C_YUV420_UYVY( );
873 C_YUV420_UYVY( );
874 C_YUV420_UYVY( );
875 #else
876 MMX_CALL( MMX_YUV420_UYVY );
877 #endif
879 for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
881 C_YUV420_UYVY( );
884 p_y1 += i_source_margin;
885 p_y2 += i_source_margin;
886 p_u += i_source_margin_c;
887 p_v += i_source_margin_c;
888 p_line1 += i_dest_margin;
889 p_line2 += i_dest_margin;
892 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
893 /* re-enable FPU registers */
894 MMX_END;
895 #endif
897 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
899 ** SSE2 128 bits fetch/store instructions are faster
900 ** if memory access is 16 bytes aligned
902 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
903 ((intptr_t)p_line2|(intptr_t)p_y2))) )
905 /* use faster SSE2 aligned fetch and store */
906 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
908 p_line1 = p_line2;
909 p_line2 += p_dest->p->i_pitch;
911 p_y1 = p_y2;
912 p_y2 += p_source->p[Y_PLANE].i_pitch;
914 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
916 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
918 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
920 C_YUV420_UYVY( );
923 p_y1 += i_source_margin;
924 p_y2 += i_source_margin;
925 p_u += i_source_margin_c;
926 p_v += i_source_margin_c;
927 p_line1 += i_dest_margin;
928 p_line2 += i_dest_margin;
931 else
933 /* use slower SSE2 unaligned fetch and store */
934 for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
936 p_line1 = p_line2;
937 p_line2 += p_dest->p->i_pitch;
939 p_y1 = p_y2;
940 p_y2 += p_source->p[Y_PLANE].i_pitch;
942 for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
944 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
946 for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
948 C_YUV420_UYVY( );
951 p_y1 += i_source_margin;
952 p_y2 += i_source_margin;
953 p_u += i_source_margin_c;
954 p_v += i_source_margin_c;
955 p_line1 += i_dest_margin;
956 p_line2 += i_dest_margin;
959 /* make sure all SSE2 stores are visible thereafter */
960 SSE2_END;
961 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
963 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * Plain C only.  Two source lines are consumed and two output lines are
 * produced per pass; the packing is done by C_YUV420_Y211 (expected to come
 * from i420_yuy2.h), invoked twice for every 8 pixels of width.  Pixels
 * beyond the last multiple of 8 are not converted.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    /* Output cursors for the two packed lines of the current pair */
    uint8_t *p_line1;
    uint8_t *p_line2 = p_dest->p->p_pixels;
    /* Luma cursors for the two source lines of the current pair */
    uint8_t *p_y1;
    uint8_t *p_y2 = p_source->Y_PIXELS;
    /* Chroma cursors, shared by both lines of a pair */
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    /* Bytes to skip at the end of each line: pitch minus visible pitch */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch;
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch;

    i_y = p_filter->fmt_in.video.i_height / 2;
    while( i_y-- )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        i_x = p_filter->fmt_in.video.i_width / 8;
        while( i_x-- )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        p_y1 += i_source_margin;
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line1 += i_dest_margin;
        p_line2 += i_dest_margin;
    }
}
#endif