mux: mp4: rework fast start
[vlc.git] / modules / video_chroma / i420_yuy2.c
blobb1a0359fc9b237cd1ce7d6ba87e155396348afdd
/*****************************************************************************
 * i420_yuy2.c : YUV to YUV conversion module for vlc
 *****************************************************************************
 * Copyright (C) 2000, 2001 VLC authors and VideoLAN
 * $Id$
 *
 * Authors: Samuel Hocevar <sam@zoy.org>
 *          Damien Fouilleul <damien@videolan.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
 *****************************************************************************/

/*****************************************************************************
 * Preamble
 *****************************************************************************/
29 #ifdef HAVE_CONFIG_H
30 # include "config.h"
31 #endif
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
36 #include <vlc_picture.h>
37 #include <vlc_cpu.h>
39 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
40 # include <altivec.h>
41 #endif
43 #include "i420_yuy2.h"
45 #define SRC_FOURCC "I420,IYUV,YV12"
47 #if defined (MODULE_NAME_IS_i420_yuy2)
48 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,Y211"
49 # define VLC_TARGET
50 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
51 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
52 # define VLC_TARGET VLC_MMX
53 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
54 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
55 # define VLC_TARGET VLC_SSE
56 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
57 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
58 # define VLC_TARGET
59 #endif
61 /*****************************************************************************
62 * Local and extern prototypes.
63 *****************************************************************************/
64 static int Activate ( vlc_object_t * );
66 static void I420_YUY2 ( filter_t *, picture_t *, picture_t * );
67 static void I420_YVYU ( filter_t *, picture_t *, picture_t * );
68 static void I420_UYVY ( filter_t *, picture_t *, picture_t * );
69 static picture_t *I420_YUY2_Filter ( filter_t *, picture_t * );
70 static picture_t *I420_YVYU_Filter ( filter_t *, picture_t * );
71 static picture_t *I420_UYVY_Filter ( filter_t *, picture_t * );
72 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
73 static void I420_IUYV ( filter_t *, picture_t *, picture_t * );
74 static picture_t *I420_IUYV_Filter ( filter_t *, picture_t * );
75 #endif
76 #if defined (MODULE_NAME_IS_i420_yuy2)
77 static void I420_Y211 ( filter_t *, picture_t *, picture_t * );
78 static picture_t *I420_Y211_Filter ( filter_t *, picture_t * );
79 #endif
81 /*****************************************************************************
82 * Module descriptor.
83 *****************************************************************************/
84 vlc_module_begin ()
85 #if defined (MODULE_NAME_IS_i420_yuy2)
86 set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
87 set_capability( "video converter", 80 )
88 # define vlc_CPU_capable() (true)
89 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
90 set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
91 set_capability( "video converter", 160 )
92 # define vlc_CPU_capable() vlc_CPU_MMX()
93 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
94 set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
95 set_capability( "video converter", 250 )
96 # define vlc_CPU_capable() vlc_CPU_SSE2()
97 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
98 set_description(
99 _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
100 set_capability( "video converter", 250 )
101 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
102 #endif
103 set_callbacks( Activate, NULL )
104 vlc_module_end ()
106 /*****************************************************************************
107 * Activate: allocate a chroma function
108 *****************************************************************************
109 * This function allocates and initializes a chroma function
110 *****************************************************************************/
111 static int Activate( vlc_object_t *p_this )
113 filter_t *p_filter = (filter_t *)p_this;
115 if( !vlc_CPU_capable() )
116 return VLC_EGENERIC;
117 if( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) & 1
118 || (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) & 1 )
120 return -1;
123 if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
124 || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height
125 || p_filter->fmt_in.video.orientation != p_filter->fmt_out.video.orientation )
126 return -1;
128 switch( p_filter->fmt_in.video.i_chroma )
130 // case VLC_CODEC_YV12: FIXME invert U and V in the filters :)
131 case VLC_CODEC_I420:
132 switch( p_filter->fmt_out.video.i_chroma )
134 case VLC_CODEC_YUYV:
135 p_filter->pf_video_filter = I420_YUY2_Filter;
136 break;
138 case VLC_CODEC_YVYU:
139 p_filter->pf_video_filter = I420_YVYU_Filter;
140 break;
142 case VLC_CODEC_UYVY:
143 p_filter->pf_video_filter = I420_UYVY_Filter;
144 break;
145 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
146 case VLC_FOURCC('I','U','Y','V'):
147 p_filter->pf_video_filter = I420_IUYV_Filter;
148 break;
149 #endif
151 #if defined (MODULE_NAME_IS_i420_yuy2)
152 case VLC_CODEC_Y211:
153 p_filter->pf_video_filter = I420_Y211_Filter;
154 break;
155 #endif
157 default:
158 return -1;
160 break;
162 default:
163 return -1;
166 return 0;
#if 0
/* Disabled profiling helper: returns the x86 time-stamp counter.
 *
 * RDTSC leaves the low half of the counter in EAX and the high half in EDX.
 * The two halves are read through separate "=a"/"=d" outputs because the
 * combined "=A" constraint only names the edx:eax pair on 32-bit x86; on
 * x86-64 it would yield a single register and lose half of the value. */
static inline unsigned long long read_cycles(void)
{
    unsigned int lo, hi;
    __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi));

    return ((unsigned long long)hi << 32) | lo;
}
#endif

179 /* Following functions are local */
181 VIDEO_FILTER_WRAPPER( I420_YUY2 )
182 VIDEO_FILTER_WRAPPER( I420_YVYU )
183 VIDEO_FILTER_WRAPPER( I420_UYVY )
184 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
185 VIDEO_FILTER_WRAPPER( I420_IUYV )
186 #endif
187 #if defined (MODULE_NAME_IS_i420_yuy2)
188 VIDEO_FILTER_WRAPPER( I420_Y211 )
189 #endif
191 /*****************************************************************************
192 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
193 *****************************************************************************/
194 VLC_TARGET
195 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
196 picture_t *p_dest )
198 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
199 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
200 uint8_t *p_u = p_source->U_PIXELS;
201 uint8_t *p_v = p_source->V_PIXELS;
203 int i_x, i_y;
205 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
206 #define VEC_NEXT_LINES( ) \
207 p_line1 = p_line2; \
208 p_line2 += p_dest->p->i_pitch; \
209 p_y1 = p_y2; \
210 p_y2 += p_source->p[Y_PLANE].i_pitch;
212 #define VEC_LOAD_UV( ) \
213 u_vec = vec_ld( 0, p_u ); p_u += 16; \
214 v_vec = vec_ld( 0, p_v ); p_v += 16;
216 #define VEC_MERGE( a ) \
217 uv_vec = a( u_vec, v_vec ); \
218 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
219 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
220 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
221 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
222 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
223 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
225 vector unsigned char u_vec;
226 vector unsigned char v_vec;
227 vector unsigned char uv_vec;
228 vector unsigned char y_vec;
230 if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
231 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
233 /* Width is a multiple of 32, we take 2 lines at a time */
234 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
236 VEC_NEXT_LINES( );
237 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
239 VEC_LOAD_UV( );
240 VEC_MERGE( vec_mergeh );
241 VEC_MERGE( vec_mergel );
245 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
246 #if 0
247 else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
248 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
250 /* Width is only a multiple of 16, we take 4 lines at a time */
251 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
253 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
254 VEC_NEXT_LINES( );
255 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
257 VEC_LOAD_UV( );
258 VEC_MERGE( vec_mergeh );
259 VEC_MERGE( vec_mergel );
262 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
263 VEC_LOAD_UV( );
264 VEC_MERGE( vec_mergeh );
266 /* Line 3 and 4, pixels 0 to 16 */
267 VEC_NEXT_LINES( );
268 VEC_MERGE( vec_mergel );
270 /* Line 3 and 4, pixels 16 to ( width ) */
271 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
273 VEC_LOAD_UV( );
274 VEC_MERGE( vec_mergeh );
275 VEC_MERGE( vec_mergel );
279 #endif
280 else
282 /* Crap, use the C version */
283 #undef VEC_NEXT_LINES
284 #undef VEC_LOAD_UV
285 #undef VEC_MERGE
286 #endif
288 const int i_source_margin = p_source->p[0].i_pitch
289 - p_source->p[0].i_visible_pitch
290 - p_filter->fmt_in.video.i_x_offset;
291 const int i_source_margin_c = p_source->p[1].i_pitch
292 - p_source->p[1].i_visible_pitch
293 - ( p_filter->fmt_in.video.i_x_offset / 2 );
294 const int i_dest_margin = p_dest->p->i_pitch
295 - p_dest->p->i_visible_pitch
296 - ( p_filter->fmt_out.video.i_x_offset * 2 );
298 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
299 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
301 p_line1 = p_line2;
302 p_line2 += p_dest->p->i_pitch;
304 p_y1 = p_y2;
305 p_y2 += p_source->p[Y_PLANE].i_pitch;
307 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
308 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x-- ; )
310 C_YUV420_YUYV( );
311 C_YUV420_YUYV( );
312 C_YUV420_YUYV( );
313 C_YUV420_YUYV( );
315 #else
316 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
318 MMX_CALL( MMX_YUV420_YUYV );
320 #endif
321 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x-- ; )
323 C_YUV420_YUYV( );
326 p_y2 += i_source_margin;
327 p_u += i_source_margin_c;
328 p_v += i_source_margin_c;
329 p_line2 += i_dest_margin;
332 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
333 /* re-enable FPU registers */
334 MMX_END;
335 #endif
337 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
339 #endif
341 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
343 ** SSE2 128 bits fetch/store instructions are faster
344 ** if memory access is 16 bytes aligned
347 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
348 ((intptr_t)p_line2|(intptr_t)p_y2))) )
350 /* use faster SSE2 aligned fetch and store */
351 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
353 p_line1 = p_line2;
354 p_line2 += p_dest->p->i_pitch;
356 p_y1 = p_y2;
357 p_y2 += p_source->p[Y_PLANE].i_pitch;
359 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
361 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
363 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
365 C_YUV420_YUYV( );
368 p_y2 += i_source_margin;
369 p_u += i_source_margin_c;
370 p_v += i_source_margin_c;
371 p_line2 += i_dest_margin;
374 else
376 /* use slower SSE2 unaligned fetch and store */
377 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
379 p_line1 = p_line2;
380 p_line2 += p_dest->p->i_pitch;
382 p_y1 = p_y2;
383 p_y2 += p_source->p[Y_PLANE].i_pitch;
385 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
387 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
389 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
391 C_YUV420_YUYV( );
394 p_y2 += i_source_margin;
395 p_u += i_source_margin_c;
396 p_v += i_source_margin_c;
397 p_line2 += i_dest_margin;
400 /* make sure all SSE2 stores are visible thereafter */
401 SSE2_END;
403 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
406 /*****************************************************************************
407 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
408 *****************************************************************************/
409 VLC_TARGET
410 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
411 picture_t *p_dest )
413 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
414 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
415 uint8_t *p_u = p_source->U_PIXELS;
416 uint8_t *p_v = p_source->V_PIXELS;
418 int i_x, i_y;
420 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
421 #define VEC_NEXT_LINES( ) \
422 p_line1 = p_line2; \
423 p_line2 += p_dest->p->i_pitch; \
424 p_y1 = p_y2; \
425 p_y2 += p_source->p[Y_PLANE].i_pitch;
427 #define VEC_LOAD_UV( ) \
428 u_vec = vec_ld( 0, p_u ); p_u += 16; \
429 v_vec = vec_ld( 0, p_v ); p_v += 16;
431 #define VEC_MERGE( a ) \
432 vu_vec = a( v_vec, u_vec ); \
433 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
434 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
435 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
436 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
437 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
438 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
440 vector unsigned char u_vec;
441 vector unsigned char v_vec;
442 vector unsigned char vu_vec;
443 vector unsigned char y_vec;
445 if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
446 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
448 /* Width is a multiple of 32, we take 2 lines at a time */
449 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
451 VEC_NEXT_LINES( );
452 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
454 VEC_LOAD_UV( );
455 VEC_MERGE( vec_mergeh );
456 VEC_MERGE( vec_mergel );
460 else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
461 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
463 /* Width is only a multiple of 16, we take 4 lines at a time */
464 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
466 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
467 VEC_NEXT_LINES( );
468 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
470 VEC_LOAD_UV( );
471 VEC_MERGE( vec_mergeh );
472 VEC_MERGE( vec_mergel );
475 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
476 VEC_LOAD_UV( );
477 VEC_MERGE( vec_mergeh );
479 /* Line 3 and 4, pixels 0 to 16 */
480 VEC_NEXT_LINES( );
481 VEC_MERGE( vec_mergel );
483 /* Line 3 and 4, pixels 16 to ( width ) */
484 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
486 VEC_LOAD_UV( );
487 VEC_MERGE( vec_mergeh );
488 VEC_MERGE( vec_mergel );
492 else
494 /* Crap, use the C version */
495 #undef VEC_NEXT_LINES
496 #undef VEC_LOAD_UV
497 #undef VEC_MERGE
498 #endif
500 const int i_source_margin = p_source->p[0].i_pitch
501 - p_source->p[0].i_visible_pitch
502 - p_filter->fmt_in.video.i_x_offset;
503 const int i_source_margin_c = p_source->p[1].i_pitch
504 - p_source->p[1].i_visible_pitch
505 - ( p_filter->fmt_in.video.i_x_offset / 2 );
506 const int i_dest_margin = p_dest->p->i_pitch
507 - p_dest->p->i_visible_pitch
508 - ( p_filter->fmt_out.video.i_x_offset * 2 );
510 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
511 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
513 p_line1 = p_line2;
514 p_line2 += p_dest->p->i_pitch;
516 p_y1 = p_y2;
517 p_y2 += p_source->p[Y_PLANE].i_pitch;
519 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
521 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
522 C_YUV420_YVYU( );
523 C_YUV420_YVYU( );
524 C_YUV420_YVYU( );
525 C_YUV420_YVYU( );
526 #else
527 MMX_CALL( MMX_YUV420_YVYU );
528 #endif
530 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x-- ; )
532 C_YUV420_YVYU( );
535 p_y1 += i_source_margin;
536 p_y2 += i_source_margin;
537 p_u += i_source_margin_c;
538 p_v += i_source_margin_c;
539 p_line1 += i_dest_margin;
540 p_line2 += i_dest_margin;
543 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
544 /* re-enable FPU registers */
545 MMX_END;
546 #endif
548 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
550 #endif
552 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
554 ** SSE2 128 bits fetch/store instructions are faster
555 ** if memory access is 16 bytes aligned
557 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
558 ((intptr_t)p_line2|(intptr_t)p_y2))) )
560 /* use faster SSE2 aligned fetch and store */
561 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
563 p_line1 = p_line2;
564 p_line2 += p_dest->p->i_pitch;
566 p_y1 = p_y2;
567 p_y2 += p_source->p[Y_PLANE].i_pitch;
569 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
571 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
573 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
575 C_YUV420_YVYU( );
578 p_y1 += i_source_margin;
579 p_y2 += i_source_margin;
580 p_u += i_source_margin_c;
581 p_v += i_source_margin_c;
582 p_line1 += i_dest_margin;
583 p_line2 += i_dest_margin;
586 else
588 /* use slower SSE2 unaligned fetch and store */
589 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
591 p_line1 = p_line2;
592 p_line2 += p_dest->p->i_pitch;
594 p_y1 = p_y2;
595 p_y2 += p_source->p[Y_PLANE].i_pitch;
597 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
599 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
601 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
603 C_YUV420_YVYU( );
606 p_y1 += i_source_margin;
607 p_y2 += i_source_margin;
608 p_u += i_source_margin_c;
609 p_v += i_source_margin_c;
610 p_line1 += i_dest_margin;
611 p_line2 += i_dest_margin;
614 /* make sure all SSE2 stores are visible thereafter */
615 SSE2_END;
616 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
619 /*****************************************************************************
620 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
621 *****************************************************************************/
622 VLC_TARGET
623 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
624 picture_t *p_dest )
626 uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
627 uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
628 uint8_t *p_u = p_source->U_PIXELS;
629 uint8_t *p_v = p_source->V_PIXELS;
631 int i_x, i_y;
633 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
634 #define VEC_NEXT_LINES( ) \
635 p_line1 = p_line2; \
636 p_line2 += p_dest->p->i_pitch; \
637 p_y1 = p_y2; \
638 p_y2 += p_source->p[Y_PLANE].i_pitch;
640 #define VEC_LOAD_UV( ) \
641 u_vec = vec_ld( 0, p_u ); p_u += 16; \
642 v_vec = vec_ld( 0, p_v ); p_v += 16;
644 #define VEC_MERGE( a ) \
645 uv_vec = a( u_vec, v_vec ); \
646 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
647 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
648 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
649 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
650 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
651 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
653 vector unsigned char u_vec;
654 vector unsigned char v_vec;
655 vector unsigned char uv_vec;
656 vector unsigned char y_vec;
658 if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) |
659 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 2 ) ) )
661 /* Width is a multiple of 32, we take 2 lines at a time */
662 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
664 VEC_NEXT_LINES( );
665 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
667 VEC_LOAD_UV( );
668 VEC_MERGE( vec_mergeh );
669 VEC_MERGE( vec_mergel );
673 else if( !( ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) |
674 ( (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) % 4 ) ) )
676 /* Width is only a multiple of 16, we take 4 lines at a time */
677 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 4 ; i_y-- ; )
679 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
680 VEC_NEXT_LINES( );
681 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
683 VEC_LOAD_UV( );
684 VEC_MERGE( vec_mergeh );
685 VEC_MERGE( vec_mergel );
688 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
689 VEC_LOAD_UV( );
690 VEC_MERGE( vec_mergeh );
692 /* Line 3 and 4, pixels 0 to 16 */
693 VEC_NEXT_LINES( );
694 VEC_MERGE( vec_mergel );
696 /* Line 3 and 4, pixels 16 to ( width ) */
697 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
699 VEC_LOAD_UV( );
700 VEC_MERGE( vec_mergeh );
701 VEC_MERGE( vec_mergel );
705 else
707 /* Crap, use the C version */
708 #undef VEC_NEXT_LINES
709 #undef VEC_LOAD_UV
710 #undef VEC_MERGE
711 #endif
713 const int i_source_margin = p_source->p[0].i_pitch
714 - p_source->p[0].i_visible_pitch
715 - p_filter->fmt_in.video.i_x_offset;
716 const int i_source_margin_c = p_source->p[1].i_pitch
717 - p_source->p[1].i_visible_pitch
718 - ( p_filter->fmt_in.video.i_x_offset / 2 );
719 const int i_dest_margin = p_dest->p->i_pitch
720 - p_dest->p->i_visible_pitch
721 - ( p_filter->fmt_out.video.i_x_offset * 2 );
723 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
724 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
726 p_line1 = p_line2;
727 p_line2 += p_dest->p->i_pitch;
729 p_y1 = p_y2;
730 p_y2 += p_source->p[Y_PLANE].i_pitch;
732 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8 ; i_x-- ; )
734 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
735 C_YUV420_UYVY( );
736 C_YUV420_UYVY( );
737 C_YUV420_UYVY( );
738 C_YUV420_UYVY( );
739 #else
740 MMX_CALL( MMX_YUV420_UYVY );
741 #endif
743 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 8 ) / 2; i_x--; )
745 C_YUV420_UYVY( );
748 p_y1 += i_source_margin;
749 p_y2 += i_source_margin;
750 p_u += i_source_margin_c;
751 p_v += i_source_margin_c;
752 p_line1 += i_dest_margin;
753 p_line2 += i_dest_margin;
756 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
757 /* re-enable FPU registers */
758 MMX_END;
759 #endif
761 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
763 #endif
765 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
767 ** SSE2 128 bits fetch/store instructions are faster
768 ** if memory access is 16 bytes aligned
770 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
771 ((intptr_t)p_line2|(intptr_t)p_y2))) )
773 /* use faster SSE2 aligned fetch and store */
774 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
776 p_line1 = p_line2;
777 p_line2 += p_dest->p->i_pitch;
779 p_y1 = p_y2;
780 p_y2 += p_source->p[Y_PLANE].i_pitch;
782 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
784 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
786 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
788 C_YUV420_UYVY( );
791 p_y1 += i_source_margin;
792 p_y2 += i_source_margin;
793 p_u += i_source_margin_c;
794 p_v += i_source_margin_c;
795 p_line1 += i_dest_margin;
796 p_line2 += i_dest_margin;
799 else
801 /* use slower SSE2 unaligned fetch and store */
802 for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
804 p_line1 = p_line2;
805 p_line2 += p_dest->p->i_pitch;
807 p_y1 = p_y2;
808 p_y2 += p_source->p[Y_PLANE].i_pitch;
810 for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16 ; i_x-- ; )
812 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
814 for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 16 ) / 2; i_x-- ; )
816 C_YUV420_UYVY( );
819 p_y1 += i_source_margin;
820 p_y2 += i_source_margin;
821 p_u += i_source_margin_c;
822 p_v += i_source_margin_c;
823 p_line1 += i_dest_margin;
824 p_line2 += i_dest_margin;
827 /* make sure all SSE2 stores are visible thereafter */
828 SSE2_END;
829 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
832 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
833 /*****************************************************************************
834 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
835 *****************************************************************************/
836 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
837 picture_t *p_dest )
839 VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
840 /* FIXME: TODO ! */
841 msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
843 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
/*****************************************************************************
 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *****************************************************************************
 * p_filter: supplies the input offsets and visible size (fmt_in.video) and
 *           the output x offset (fmt_out.video)
 * p_source: planar picture, read through Y_PIXELS / U_PIXELS / V_PIXELS
 * p_dest:   packed picture, written through its first plane (p_dest->p)
 *
 * Plain C only.  The pointer names below are fixed: C_YUV420_Y211 takes no
 * arguments and reaches these locals by name.
 *****************************************************************************/
#if defined (MODULE_NAME_IS_i420_yuy2)
static void I420_Y211( filter_t *p_filter, picture_t *p_source,
                                           picture_t *p_dest )
{
    uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
    uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
    uint8_t *p_u = p_source->U_PIXELS;
    uint8_t *p_v = p_source->V_PIXELS;

    int i_x, i_y;

    /* Number of columns / lines walked from the start of the planes:
     * offset plus visible size.  Loop-invariant, so computed once. */
    const unsigned i_width  = p_filter->fmt_in.video.i_x_offset
                            + p_filter->fmt_in.video.i_visible_width;
    const unsigned i_height = p_filter->fmt_in.video.i_y_offset
                            + p_filter->fmt_in.video.i_visible_height;

    /* Bytes to skip after the walked columns to reach the next line of each
     * plane (presumably i_visible_pitch covers exactly the visible columns -
     * TODO confirm). */
    const int i_source_margin = p_source->p[0].i_pitch
                                 - p_source->p[0].i_visible_pitch
                                 - p_filter->fmt_in.video.i_x_offset;
    const int i_source_margin_c = p_source->p[1].i_pitch
                                 - p_source->p[1].i_visible_pitch
                                 - ( p_filter->fmt_in.video.i_x_offset / 2 );
    const int i_dest_margin = p_dest->p->i_pitch
                               - p_dest->p->i_visible_pitch
                               - ( p_filter->fmt_out.video.i_x_offset * 2 );

    for( i_y = i_height / 2 ; i_y-- ; )
    {
        p_line1 = p_line2;
        p_line2 += p_dest->p->i_pitch;

        p_y1 = p_y2;
        p_y2 += p_source->p[Y_PLANE].i_pitch;

        /* NOTE(review): unlike the other converters there is no remainder
         * loop, so columns beyond a multiple of 8 are not converted and the
         * margins below presumably no longer land on the next line start -
         * confirm whether widths that are not multiples of 8 can occur. */
        for( i_x = i_width / 8 ; i_x-- ; )
        {
            C_YUV420_Y211( );
            C_YUV420_Y211( );
        }

        /* Only the second-line pointers need the margin: p_y1 and p_line1
         * are reloaded from them at the top of the loop, so adjusting them
         * here (as this function used to) had no effect. */
        p_y2 += i_source_margin;
        p_u += i_source_margin_c;
        p_v += i_source_margin_c;
        p_line2 += i_dest_margin;
    }
}
#endif