1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
42 #include "i420_yuy2.h"
44 #define SRC_FOURCC "I420,IYUV,YV12"
46 #if defined (MODULE_NAME_IS_i420_yuy2)
47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
51 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
53 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
56 /*****************************************************************************
57 * Local and extern prototypes.
58 *****************************************************************************/
/*
 * Forward declarations.
 *  - Activate( vlc_object_t * ) is the function handed to set_callbacks().
 *  - I420_xxxx( filter_t *, picture_t *, picture_t * ) are the conversion
 *    routines; their definitions use the arguments as the filter, the
 *    source picture and the destination picture.
 *  - I420_xxxx_Filter( filter_t *, picture_t * ) return a picture_t * and
 *    are the functions Activate stores in p_filter->pf_video_filter.
 */
59 static int Activate ( vlc_object_t
* );
61 static void I420_YUY2 ( filter_t
*, picture_t
*, picture_t
* );
62 static void I420_YVYU ( filter_t
*, picture_t
*, picture_t
* );
63 static void I420_UYVY ( filter_t
*, picture_t
*, picture_t
* );
64 static picture_t
*I420_YUY2_Filter ( filter_t
*, picture_t
* );
65 static picture_t
*I420_YVYU_Filter ( filter_t
*, picture_t
* );
66 static picture_t
*I420_UYVY_Filter ( filter_t
*, picture_t
* );
/* IUYV and cyuv are declared for every variant except AltiVec */
67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
68 static void I420_IUYV ( filter_t
*, picture_t
*, picture_t
* );
69 static void I420_cyuv ( filter_t
*, picture_t
*, picture_t
* );
70 static picture_t
*I420_IUYV_Filter ( filter_t
*, picture_t
* );
71 static picture_t
*I420_cyuv_Filter ( filter_t
*, picture_t
* );
/* Y211 is declared for the plain C variant only */
73 #if defined (MODULE_NAME_IS_i420_yuy2)
74 static void I420_Y211 ( filter_t
*, picture_t
*, picture_t
* );
75 static picture_t
*I420_Y211_Filter ( filter_t
*, picture_t
* );
78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
79 /* Initialize MMX-specific constants */
/* i_00ffw: 0x00ff in each of the four 16-bit words of the 64-bit value */
80 static const uint64_t i_00ffw
= 0x00ff00ff00ff00ffULL
;
/* i_80w: 0x80 in each of the four low-order bytes, upper four bytes zero */
81 static const uint64_t i_80w
= 0x0000000080808080ULL
;
84 /*****************************************************************************
86 *****************************************************************************/
/*
 * Module descriptor entries, one set per build variant (plain C, MMX, SSE2,
 * AltiVec).  Each variant provides a description string assembled from
 * SRC_FOURCC and DEST_FOURCC, registers the "video filter2" capability with
 * a variant-specific number (80, 160, 250, 250 - presumably the module
 * priority, confirm against the VLC plugin API) and defines CPU_CAPABILITY,
 * which Activate tests against vlc_CPU().
 */
88 #if defined (MODULE_NAME_IS_i420_yuy2)
89 set_description( N_("Conversions from " SRC_FOURCC
" to " DEST_FOURCC
) )
90 set_capability( "video filter2", 80 )
/* The plain C variant needs no particular CPU feature */
91 # define CPU_CAPABILITY 0
92 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
93 set_description( N_("MMX conversions from " SRC_FOURCC
" to " DEST_FOURCC
) )
94 set_capability( "video filter2", 160 )
95 # define CPU_CAPABILITY CPU_CAPABILITY_MMX
96 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
97 set_description( N_("SSE2 conversions from " SRC_FOURCC
" to " DEST_FOURCC
) )
98 set_capability( "video filter2", 250 )
99 # define CPU_CAPABILITY CPU_CAPABILITY_SSE2
100 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
102 _("AltiVec conversions from " SRC_FOURCC
" to " DEST_FOURCC
) );
103 set_capability( "video filter2", 250 )
104 # define CPU_CAPABILITY CPU_CAPABILITY_ALTIVEC
/* Activate is the first callback; the second callback slot is NULL */
106 set_callbacks( Activate
, NULL
)
109 /*****************************************************************************
110 * Activate: select the conversion routine for a filter
111 *****************************************************************************
112 * This function checks the CPU capability and the input/output video
 * formats of the filter passed as p_this, then stores the matching
 * I420_xxxx_Filter function in p_filter->pf_video_filter.
 * The bodies of the failing branches and the returned values are not part
 * of this listing.
113 *****************************************************************************/
114 static int Activate( vlc_object_t
*p_this
)
116 filter_t
*p_filter
= (filter_t
*)p_this
;
/* CPU feature test against the variant's CPU_CAPABILITY.
 * NOTE(review): the plain C variant defines CPU_CAPABILITY as 0, which makes
 * this condition always true; presumably it is guarded by a preprocessor
 * test that is not shown here - confirm. */
119 if( !(vlc_CPU() & CPU_CAPABILITY
) )
/* True when the input width or the input height is odd */
122 if( p_filter
->fmt_in
.video
.i_width
& 1
123 || p_filter
->fmt_in
.video
.i_height
& 1 )
/* True when input and output differ in width or in height */
128 if( p_filter
->fmt_in
.video
.i_width
!= p_filter
->fmt_out
.video
.i_width
129 || p_filter
->fmt_in
.video
.i_height
!= p_filter
->fmt_out
.video
.i_height
)
/* Dispatch first on the input chroma ... */
132 switch( p_filter
->fmt_in
.video
.i_chroma
)
/* ... then on the output chroma; each case installs one entry point */
136 switch( p_filter
->fmt_out
.video
.i_chroma
)
139 p_filter
->pf_video_filter
= I420_YUY2_Filter
;
143 p_filter
->pf_video_filter
= I420_YVYU_Filter
;
147 p_filter
->pf_video_filter
= I420_UYVY_Filter
;
/* IUYV and cyuv outputs are offered by every variant except AltiVec */
149 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
150 case VLC_FOURCC('I','U','Y','V'):
151 p_filter
->pf_video_filter
= I420_IUYV_Filter
;
155 p_filter
->pf_video_filter
= I420_cyuv_Filter
;
/* Y211 output is offered by the plain C variant only */
159 #if defined (MODULE_NAME_IS_i420_yuy2)
161 p_filter
->pf_video_filter
= I420_Y211_Filter
;
/*
 * read_cycles: executes the x86 "rdtsc" instruction and receives its result
 * in v through the "=A" output constraint.
 * NOTE(review): GCC documents "A" as the edx:eax register pair for 32-bit
 * x86 only; on x86-64 it does not bind both halves of the counter.  Confirm
 * which targets compile this helper.
 */
178 static inline unsigned long long read_cycles(void)
180 unsigned long long v
;
181 __asm__
__volatile__("rdtsc" : "=A" (v
): );
187 /* Following functions are local */
/*
 * One VIDEO_FILTER_WRAPPER invocation per conversion routine.  The macro is
 * defined outside this file; presumably it generates the matching
 * I420_xxxx_Filter function declared above - confirm in the VLC headers.
 * The same preprocessor conditions as for the prototypes apply.
 */
189 VIDEO_FILTER_WRAPPER( I420_YUY2
)
190 VIDEO_FILTER_WRAPPER( I420_YVYU
)
191 VIDEO_FILTER_WRAPPER( I420_UYVY
)
192 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
193 VIDEO_FILTER_WRAPPER( I420_IUYV
)
194 VIDEO_FILTER_WRAPPER( I420_cyuv
)
196 #if defined (MODULE_NAME_IS_i420_yuy2)
197 VIDEO_FILTER_WRAPPER( I420_Y211
)
200 /*****************************************************************************
201 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 *
 * Reads the Y, U and V planes of p_source (Y_PIXELS / U_PIXELS / V_PIXELS)
 * and writes the plane reached through p_dest->p.  The picture size is taken
 * from p_filter->fmt_in.video.
 * Code paths, selected by the preprocessor:
 *  - AltiVec: vector loops for widths divisible by 32 (or 16), otherwise the
 *    C loop;
 *  - every variant except SSE2: a loop handling 8 pixels per step (C or MMX)
 *    plus a loop for the leftover pixel pairs;
 *  - SSE2: 16 pixels per step, with an aligned and an unaligned flavour.
 * All loops work on two rows at a time and skip the row padding afterwards.
 * Several statements of the original body (brace lines, the assignments to
 * p_line1 / p_y1, the C macro invocations) are not part of this listing.
202 *****************************************************************************/
203 static void I420_YUY2( filter_t
*p_filter
, picture_t
*p_source
,
/* Only p_line2 and p_y2 carry an initialiser; p_line1 and p_y1 do not */
206 uint8_t *p_line1
, *p_line2
= p_dest
->p
->p_pixels
;
207 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
208 uint8_t *p_u
= p_source
->U_PIXELS
;
209 uint8_t *p_v
= p_source
->V_PIXELS
;
213 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* VEC_NEXT_LINES: moves p_line2 and p_y2 forward by one row pitch */
214 #define VEC_NEXT_LINES( ) \
216 p_line2 += p_dest->p->i_pitch; \
218 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* VEC_LOAD_UV: loads one vector from p_u and one from p_v, then advances
 * both pointers by 16 bytes */
220 #define VEC_LOAD_UV( ) \
221 u_vec = vec_ld( 0, p_u ); p_u += 16; \
222 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* VEC_MERGE( a ): merges u_vec with v_vec using `a` (vec_mergeh or
 * vec_mergel), then merges 16 luma bytes of each of the two rows with that
 * result - luma first - and stores 32 bytes to each destination row */
224 #define VEC_MERGE( a ) \
225 uv_vec = a( u_vec, v_vec ); \
226 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
227 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
228 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
229 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
230 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
231 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
233 vector
unsigned char u_vec
;
234 vector
unsigned char v_vec
;
235 vector
unsigned char uv_vec
;
236 vector
unsigned char y_vec
;
238 if( !( ( p_filter
->fmt_in
.video
.i_width
% 32 ) |
239 ( p_filter
->fmt_in
.video
.i_height
% 2 ) ) )
241 /* Width is a multiple of 32, we take 2 lines at a time */
242 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
/* Two VEC_MERGE calls cover 32 pixels of both rows */
245 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
248 VEC_MERGE( vec_mergeh
);
249 VEC_MERGE( vec_mergel
);
253 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
255 else if( !( ( p_filter
->fmt_in
.video
.i_width
% 16 ) |
256 ( p_filter
->fmt_in
.video
.i_height
% 4 ) ) )
258 /* Width is only a multiple of 16, we take 4 lines at a time */
259 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 4 ; i_y
-- ; )
261 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
263 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
266 VEC_MERGE( vec_mergeh
);
267 VEC_MERGE( vec_mergel
);
270 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
272 VEC_MERGE( vec_mergeh
);
274 /* Line 3 and 4, pixels 0 to 16 */
276 VEC_MERGE( vec_mergel
);
278 /* Line 3 and 4, pixels 16 to ( width ) */
279 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
282 VEC_MERGE( vec_mergeh
);
283 VEC_MERGE( vec_mergel
);
290 /* Neither vector case applies: use the C version */
291 #undef VEC_NEXT_LINES
/* Padding bytes between the visible end of a row and the next row, for the
 * luma plane (p[0]), the chroma planes (p[1]) and the destination plane */
296 const int i_source_margin
= p_source
->p
[0].i_pitch
297 - p_source
->p
[0].i_visible_pitch
;
298 const int i_source_margin_c
= p_source
->p
[1].i_pitch
299 - p_source
->p
[1].i_visible_pitch
;
300 const int i_dest_margin
= p_dest
->p
->i_pitch
301 - p_dest
->p
->i_visible_pitch
;
303 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* One pass per pair of rows */
304 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
307 p_line2
+= p_dest
->p
->i_pitch
;
310 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
312 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
/* Non-MMX build: 8 pixels per step (loop body not part of this listing) */
313 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8; i_x
-- ; )
/* MMX build: 8 pixels per MMX_YUV420_YUYV invocation */
321 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
323 MMX_CALL( MMX_YUV420_YUYV
);
/* Leftover pixels when the width is not a multiple of 8, two per step */
326 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 8 ) / 2; i_x
-- ; )
/* Skip the padding at the end of the rows just handled */
331 p_y1
+= i_source_margin
;
332 p_y2
+= i_source_margin
;
333 p_u
+= i_source_margin_c
;
334 p_v
+= i_source_margin_c
;
335 p_line1
+= i_dest_margin
;
336 p_line2
+= i_dest_margin
;
339 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
340 /* re-enable FPU registers */
344 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
348 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
350 ** SSE2 128 bits fetch/store instructions are faster
351 ** if memory access is 16 bytes aligned
/* Aligned flavour only when both pitches and both start addresses are
 * multiples of 16 */
354 if( 0 == (15 & (p_source
->p
[Y_PLANE
].i_pitch
|p_dest
->p
->i_pitch
|
355 ((intptr_t)p_line2
|(intptr_t)p_y2
))) )
357 /* use faster SSE2 aligned fetch and store */
358 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
361 p_line2
+= p_dest
->p
->i_pitch
;
364 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
/* 16 pixels per SSE2 invocation */
366 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
368 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED
);
/* Leftover pixels when the width is not a multiple of 16, two per step */
370 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
375 p_y1
+= i_source_margin
;
376 p_y2
+= i_source_margin
;
377 p_u
+= i_source_margin_c
;
378 p_v
+= i_source_margin_c
;
379 p_line1
+= i_dest_margin
;
380 p_line2
+= i_dest_margin
;
385 /* use slower SSE2 unaligned fetch and store */
386 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
389 p_line2
+= p_dest
->p
->i_pitch
;
392 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
394 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
396 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED
);
398 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
403 p_y1
+= i_source_margin
;
404 p_y2
+= i_source_margin
;
405 p_u
+= i_source_margin_c
;
406 p_v
+= i_source_margin_c
;
407 p_line1
+= i_dest_margin
;
408 p_line2
+= i_dest_margin
;
411 /* make sure all SSE2 stores are visible thereafter */
414 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
417 /*****************************************************************************
418 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 *
 * Same layout as I420_YUY2 with the two chroma components swapped: the
 * AltiVec VEC_MERGE merges v_vec before u_vec, and the MMX / SSE2 paths use
 * the *_YVYU macros.  Input comes from the Y, U and V planes of p_source,
 * output goes to the plane reached through p_dest->p, and the picture size
 * is taken from p_filter->fmt_in.video.
 * Several statements of the original body (brace lines, the assignments to
 * p_line1 / p_y1, the C macro invocations) are not part of this listing.
419 *****************************************************************************/
420 static void I420_YVYU( filter_t
*p_filter
, picture_t
*p_source
,
/* Only p_line2 and p_y2 carry an initialiser; p_line1 and p_y1 do not */
423 uint8_t *p_line1
, *p_line2
= p_dest
->p
->p_pixels
;
424 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
425 uint8_t *p_u
= p_source
->U_PIXELS
;
426 uint8_t *p_v
= p_source
->V_PIXELS
;
430 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* VEC_NEXT_LINES: moves p_line2 and p_y2 forward by one row pitch */
431 #define VEC_NEXT_LINES( ) \
433 p_line2 += p_dest->p->i_pitch; \
435 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* VEC_LOAD_UV: loads one vector from p_u and one from p_v, then advances
 * both pointers by 16 bytes */
437 #define VEC_LOAD_UV( ) \
438 u_vec = vec_ld( 0, p_u ); p_u += 16; \
439 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* VEC_MERGE( a ): merges v_vec with u_vec (V first) using `a`, then merges
 * 16 luma bytes of each of the two rows with that result - luma first - and
 * stores 32 bytes to each destination row */
441 #define VEC_MERGE( a ) \
442 vu_vec = a( v_vec, u_vec ); \
443 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
444 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
445 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
446 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
447 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
448 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
450 vector
unsigned char u_vec
;
451 vector
unsigned char v_vec
;
452 vector
unsigned char vu_vec
;
453 vector
unsigned char y_vec
;
455 if( !( ( p_filter
->fmt_in
.video
.i_width
% 32 ) |
456 ( p_filter
->fmt_in
.video
.i_height
% 2 ) ) )
458 /* Width is a multiple of 32, we take 2 lines at a time */
459 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
/* Two VEC_MERGE calls cover 32 pixels of both rows */
462 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
465 VEC_MERGE( vec_mergeh
);
466 VEC_MERGE( vec_mergel
);
470 else if( !( ( p_filter
->fmt_in
.video
.i_width
% 16 ) |
471 ( p_filter
->fmt_in
.video
.i_height
% 4 ) ) )
473 /* Width is only a multiple of 16, we take 4 lines at a time */
474 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 4 ; i_y
-- ; )
476 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
478 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
481 VEC_MERGE( vec_mergeh
);
482 VEC_MERGE( vec_mergel
);
485 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
487 VEC_MERGE( vec_mergeh
);
489 /* Line 3 and 4, pixels 0 to 16 */
491 VEC_MERGE( vec_mergel
);
493 /* Line 3 and 4, pixels 16 to ( width ) */
494 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
497 VEC_MERGE( vec_mergeh
);
498 VEC_MERGE( vec_mergel
);
504 /* Neither vector case applies: use the C version */
505 #undef VEC_NEXT_LINES
/* Padding bytes between the visible end of a row and the next row, for the
 * luma plane (p[0]), the chroma planes (p[1]) and the destination plane */
510 const int i_source_margin
= p_source
->p
[0].i_pitch
511 - p_source
->p
[0].i_visible_pitch
;
512 const int i_source_margin_c
= p_source
->p
[1].i_pitch
513 - p_source
->p
[1].i_visible_pitch
;
514 const int i_dest_margin
= p_dest
->p
->i_pitch
515 - p_dest
->p
->i_visible_pitch
;
517 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* One pass per pair of rows */
518 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
521 p_line2
+= p_dest
->p
->i_pitch
;
524 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
/* 8 pixels per step; the body is chosen by the MMX test below */
526 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
528 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
534 MMX_CALL( MMX_YUV420_YVYU
);
/* Leftover pixels when the width is not a multiple of 8, two per step */
537 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 8 ) / 2; i_x
-- ; )
/* Skip the padding at the end of the rows just handled */
542 p_y1
+= i_source_margin
;
543 p_y2
+= i_source_margin
;
544 p_u
+= i_source_margin_c
;
545 p_v
+= i_source_margin_c
;
546 p_line1
+= i_dest_margin
;
547 p_line2
+= i_dest_margin
;
550 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
551 /* re-enable FPU registers */
555 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
559 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
561 ** SSE2 128 bits fetch/store instructions are faster
562 ** if memory access is 16 bytes aligned
/* Aligned flavour only when both pitches and both start addresses are
 * multiples of 16 */
564 if( 0 == (15 & (p_source
->p
[Y_PLANE
].i_pitch
|p_dest
->p
->i_pitch
|
565 ((intptr_t)p_line2
|(intptr_t)p_y2
))) )
567 /* use faster SSE2 aligned fetch and store */
568 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
571 p_line2
+= p_dest
->p
->i_pitch
;
574 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
/* 16 pixels per SSE2 invocation */
576 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
578 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED
);
/* Leftover pixels when the width is not a multiple of 16, two per step */
580 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
585 p_y1
+= i_source_margin
;
586 p_y2
+= i_source_margin
;
587 p_u
+= i_source_margin_c
;
588 p_v
+= i_source_margin_c
;
589 p_line1
+= i_dest_margin
;
590 p_line2
+= i_dest_margin
;
595 /* use slower SSE2 unaligned fetch and store */
596 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
599 p_line2
+= p_dest
->p
->i_pitch
;
602 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
604 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
606 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED
);
608 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
613 p_y1
+= i_source_margin
;
614 p_y2
+= i_source_margin
;
615 p_u
+= i_source_margin_c
;
616 p_v
+= i_source_margin_c
;
617 p_line1
+= i_dest_margin
;
618 p_line2
+= i_dest_margin
;
621 /* make sure all SSE2 stores are visible thereafter */
623 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
626 /*****************************************************************************
627 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 *
 * Same layout as I420_YUY2 with chroma placed before luma in every output
 * pair: the AltiVec VEC_MERGE passes uv_vec as the first merge operand, and
 * the MMX / SSE2 paths use the *_UYVY macros.  Input comes from the Y, U and
 * V planes of p_source, output goes to the plane reached through p_dest->p,
 * and the picture size is taken from p_filter->fmt_in.video.
 * Several statements of the original body (brace lines, the assignments to
 * p_line1 / p_y1, the C macro invocations) are not part of this listing.
628 *****************************************************************************/
629 static void I420_UYVY( filter_t
*p_filter
, picture_t
*p_source
,
/* Only p_line2 and p_y2 carry an initialiser; p_line1 and p_y1 do not */
632 uint8_t *p_line1
, *p_line2
= p_dest
->p
->p_pixels
;
633 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
634 uint8_t *p_u
= p_source
->U_PIXELS
;
635 uint8_t *p_v
= p_source
->V_PIXELS
;
639 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
/* VEC_NEXT_LINES: moves p_line2 and p_y2 forward by one row pitch */
640 #define VEC_NEXT_LINES( ) \
642 p_line2 += p_dest->p->i_pitch; \
644 p_y2 += p_source->p[Y_PLANE].i_pitch;
/* VEC_LOAD_UV: loads one vector from p_u and one from p_v, then advances
 * both pointers by 16 bytes */
646 #define VEC_LOAD_UV( ) \
647 u_vec = vec_ld( 0, p_u ); p_u += 16; \
648 v_vec = vec_ld( 0, p_v ); p_v += 16;
/* VEC_MERGE( a ): merges u_vec with v_vec using `a`, then merges that result
 * - chroma first - with 16 luma bytes of each of the two rows and stores 32
 * bytes to each destination row */
650 #define VEC_MERGE( a ) \
651 uv_vec = a( u_vec, v_vec ); \
652 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
653 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
654 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
655 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
656 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
657 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
659 vector
unsigned char u_vec
;
660 vector
unsigned char v_vec
;
661 vector
unsigned char uv_vec
;
662 vector
unsigned char y_vec
;
664 if( !( ( p_filter
->fmt_in
.video
.i_width
% 32 ) |
665 ( p_filter
->fmt_in
.video
.i_height
% 2 ) ) )
667 /* Width is a multiple of 32, we take 2 lines at a time */
668 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
/* Two VEC_MERGE calls cover 32 pixels of both rows */
671 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
674 VEC_MERGE( vec_mergeh
);
675 VEC_MERGE( vec_mergel
);
679 else if( !( ( p_filter
->fmt_in
.video
.i_width
% 16 ) |
680 ( p_filter
->fmt_in
.video
.i_height
% 4 ) ) )
682 /* Width is only a multiple of 16, we take 4 lines at a time */
683 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 4 ; i_y
-- ; )
685 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
687 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
690 VEC_MERGE( vec_mergeh
);
691 VEC_MERGE( vec_mergel
);
694 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
696 VEC_MERGE( vec_mergeh
);
698 /* Line 3 and 4, pixels 0 to 16 */
700 VEC_MERGE( vec_mergel
);
702 /* Line 3 and 4, pixels 16 to ( width ) */
703 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
706 VEC_MERGE( vec_mergeh
);
707 VEC_MERGE( vec_mergel
);
713 /* Neither vector case applies: use the C version */
714 #undef VEC_NEXT_LINES
/* Padding bytes between the visible end of a row and the next row, for the
 * luma plane (p[0]), the chroma planes (p[1]) and the destination plane */
719 const int i_source_margin
= p_source
->p
[0].i_pitch
720 - p_source
->p
[0].i_visible_pitch
;
721 const int i_source_margin_c
= p_source
->p
[1].i_pitch
722 - p_source
->p
[1].i_visible_pitch
;
723 const int i_dest_margin
= p_dest
->p
->i_pitch
724 - p_dest
->p
->i_visible_pitch
;
726 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* One pass per pair of rows */
727 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
730 p_line2
+= p_dest
->p
->i_pitch
;
733 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
/* 8 pixels per step; the body is chosen by the MMX test below */
735 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
737 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
743 MMX_CALL( MMX_YUV420_UYVY
);
/* Leftover pixels when the width is not a multiple of 8, two per step */
746 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 8 ) / 2; i_x
--; )
/* Skip the padding at the end of the rows just handled */
751 p_y1
+= i_source_margin
;
752 p_y2
+= i_source_margin
;
753 p_u
+= i_source_margin_c
;
754 p_v
+= i_source_margin_c
;
755 p_line1
+= i_dest_margin
;
756 p_line2
+= i_dest_margin
;
759 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
760 /* re-enable FPU registers */
764 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
768 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
770 ** SSE2 128 bits fetch/store instructions are faster
771 ** if memory access is 16 bytes aligned
/* Aligned flavour only when both pitches and both start addresses are
 * multiples of 16 */
773 if( 0 == (15 & (p_source
->p
[Y_PLANE
].i_pitch
|p_dest
->p
->i_pitch
|
774 ((intptr_t)p_line2
|(intptr_t)p_y2
))) )
776 /* use faster SSE2 aligned fetch and store */
777 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
780 p_line2
+= p_dest
->p
->i_pitch
;
783 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
/* 16 pixels per SSE2 invocation */
785 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
787 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED
);
/* Leftover pixels when the width is not a multiple of 16, two per step */
789 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
794 p_y1
+= i_source_margin
;
795 p_y2
+= i_source_margin
;
796 p_u
+= i_source_margin_c
;
797 p_v
+= i_source_margin_c
;
798 p_line1
+= i_dest_margin
;
799 p_line2
+= i_dest_margin
;
804 /* use slower SSE2 unaligned fetch and store */
805 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
808 p_line2
+= p_dest
->p
->i_pitch
;
811 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
813 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
815 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED
);
817 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
822 p_y1
+= i_source_margin
;
823 p_y2
+= i_source_margin
;
824 p_u
+= i_source_margin_c
;
825 p_v
+= i_source_margin_c
;
826 p_line1
+= i_dest_margin
;
827 p_line2
+= i_dest_margin
;
830 /* make sure all SSE2 stores are visible thereafter */
832 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
835 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
836 /*****************************************************************************
837 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 *
 * Not implemented: the visible statements mark both pictures as unused and
 * report an error through msg_Err on p_filter.
838 *****************************************************************************/
839 static void I420_IUYV( filter_t
*p_filter
, picture_t
*p_source
,
/* Neither picture is read or written */
842 VLC_UNUSED(p_source
); VLC_UNUSED(p_dest
);
/* Report that this conversion is missing */
844 msg_Err( p_filter
, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
847 /*****************************************************************************
848 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 *
 * Uses the same UYVY macros as I420_UYVY.  The destination pointers start
 * beyond the last visible row, and the C / MMX loop steps them back by three
 * pitches at the top of every pass.  Input comes from the Y, U and V planes
 * of p_source, output goes to the plane reached through p_dest->p, and the
 * picture size is taken from p_filter->fmt_in.video.
 * Several statements of the original body (brace lines, the assignments to
 * p_y1, the C macro invocations) are not part of this listing.
849 *****************************************************************************/
850 static void I420_cyuv( filter_t
*p_filter
, picture_t
*p_source
,
/* p_line1 starts at p_pixels + ( i_visible_lines + 1 ) * i_pitch */
853 uint8_t *p_line1
= p_dest
->p
->p_pixels
+
854 p_dest
->p
->i_visible_lines
* p_dest
->p
->i_pitch
855 + p_dest
->p
->i_pitch
;
/* p_line2 starts at p_pixels + i_visible_lines * i_pitch, one row earlier */
856 uint8_t *p_line2
= p_dest
->p
->p_pixels
+
857 p_dest
->p
->i_visible_lines
* p_dest
->p
->i_pitch
;
858 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
859 uint8_t *p_u
= p_source
->U_PIXELS
;
860 uint8_t *p_v
= p_source
->V_PIXELS
;
/* Padding bytes between the visible end of a row and the next row, for the
 * luma plane (p[0]), the chroma planes (p[1]) and the destination plane */
864 const int i_source_margin
= p_source
->p
[0].i_pitch
865 - p_source
->p
[0].i_visible_pitch
;
866 const int i_source_margin_c
= p_source
->p
[1].i_pitch
867 - p_source
->p
[1].i_visible_pitch
;
868 const int i_dest_margin
= p_dest
->p
->i_pitch
869 - p_dest
->p
->i_visible_pitch
;
871 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
/* One pass per pair of rows, walking the destination bottom-up */
872 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
/* NOTE(review): assuming every pass leaves both pointers one full pitch
 * further (row data plus the margin added below), the first pass writes rows
 * i_visible_lines - 2 and i_visible_lines - 3 and the last pass puts p_line2
 * one row before p_pixels - verify the starting offsets above. */
874 p_line1
-= 3 * p_dest
->p
->i_pitch
;
875 p_line2
-= 3 * p_dest
->p
->i_pitch
;
878 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
/* 8 pixels per step; the body is chosen by the MMX test below */
880 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
882 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
888 MMX_CALL( MMX_YUV420_UYVY
);
/* Leftover pixels when the width is not a multiple of 8, two per step */
891 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 8 ) / 2; i_x
-- ; )
/* Skip the padding at the end of the rows just handled */
896 p_y1
+= i_source_margin
;
897 p_y2
+= i_source_margin
;
898 p_u
+= i_source_margin_c
;
899 p_v
+= i_source_margin_c
;
900 p_line1
+= i_dest_margin
;
901 p_line2
+= i_dest_margin
;
904 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
905 /* re-enable FPU registers */
909 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
911 ** SSE2 128 bits fetch/store instructions are faster
912 ** if memory access is 16 bytes aligned
/* Aligned flavour only when both pitches and both start addresses are
 * multiples of 16 */
914 if( 0 == (15 & (p_source
->p
[Y_PLANE
].i_pitch
|p_dest
->p
->i_pitch
|
915 ((intptr_t)p_line2
|(intptr_t)p_y2
))) )
917 /* use faster SSE2 aligned fetch and store */
918 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
/* NOTE(review): unlike the loop of the non-SSE2 build, which steps the
 * destination pointers back by three pitches, this loop moves p_line2
 * forward although it already starts past the last visible row - looks like
 * the output is neither upside-down nor inside the plane; confirm. */
921 p_line2
+= p_dest
->p
->i_pitch
;
924 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
/* 16 pixels per SSE2 invocation */
926 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
928 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED
);
/* Leftover pixels when the width is not a multiple of 16, two per step */
930 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
935 p_y1
+= i_source_margin
;
936 p_y2
+= i_source_margin
;
937 p_u
+= i_source_margin_c
;
938 p_v
+= i_source_margin_c
;
939 p_line1
+= i_dest_margin
;
940 p_line2
+= i_dest_margin
;
945 /* use slower SSE2 unaligned fetch and store */
946 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
/* NOTE(review): same forward stepping as in the aligned loop - confirm */
949 p_line2
+= p_dest
->p
->i_pitch
;
952 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
954 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
956 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED
);
958 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
963 p_y1
+= i_source_margin
;
964 p_y2
+= i_source_margin
;
965 p_u
+= i_source_margin_c
;
966 p_v
+= i_source_margin_c
;
967 p_line1
+= i_dest_margin
;
968 p_line2
+= i_dest_margin
;
971 /* make sure all SSE2 stores are visible thereafter */
973 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
975 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
977 /*****************************************************************************
978 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 *
 * Built for the plain C variant only.  Input comes from the Y, U and V
 * planes of p_source, output goes to the plane reached through p_dest->p,
 * and the picture size is taken from p_filter->fmt_in.video.  Two rows are
 * handled per pass of the outer loop.
 * The loop body doing the actual conversion and the assignments to p_line1 /
 * p_y1 are not part of this listing.
979 *****************************************************************************/
980 #if defined (MODULE_NAME_IS_i420_yuy2)
981 static void I420_Y211( filter_t
*p_filter
, picture_t
*p_source
,
/* Only p_line2 and p_y2 carry an initialiser; p_line1 and p_y1 do not */
984 uint8_t *p_line1
, *p_line2
= p_dest
->p
->p_pixels
;
985 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
986 uint8_t *p_u
= p_source
->U_PIXELS
;
987 uint8_t *p_v
= p_source
->V_PIXELS
;
/* Padding bytes between the visible end of a row and the next row, for the
 * luma plane (p[0]), the chroma planes (p[1]) and the destination plane */
991 const int i_source_margin
= p_source
->p
[0].i_pitch
992 - p_source
->p
[0].i_visible_pitch
;
993 const int i_source_margin_c
= p_source
->p
[1].i_pitch
994 - p_source
->p
[1].i_visible_pitch
;
995 const int i_dest_margin
= p_dest
->p
->i_pitch
996 - p_dest
->p
->i_visible_pitch
;
/* One pass per pair of rows */
998 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
1001 p_line2
+= p_dest
->p
->i_pitch
;
1004 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
/* 8 pixels per step.
 * NOTE(review): no loop for the remaining width % 8 pixels is visible here,
 * unlike the other conversions - confirm whether such widths are handled. */
1006 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
/* Skip the padding at the end of the rows just handled */
1012 p_y1
+= i_source_margin
;
1013 p_y2
+= i_source_margin
;
1014 p_u
+= i_source_margin_c
;
1015 p_v
+= i_source_margin_c
;
1016 p_line1
+= i_dest_margin
;
1017 p_line2
+= i_dest_margin
;