1 /*****************************************************************************
2 * i420_yuy2.c : YUV to YUV conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000, 2001 the VideoLAN team
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damien@videolan.org>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 /*****************************************************************************
27 *****************************************************************************/
33 #include <vlc_common.h>
34 #include <vlc_plugin.h>
35 #include <vlc_filter.h>
37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
41 #include "i420_yuy2.h"
43 #define SRC_FOURCC "I420,IYUV,YV12"
45 #if defined (MODULE_NAME_IS_i420_yuy2)
46 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
48 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
50 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
52 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
55 /*****************************************************************************
56 * Local and extern prototypes.
57 *****************************************************************************/
58 static int Activate ( vlc_object_t
* );
60 static void I420_YUY2 ( filter_t
*, picture_t
*, picture_t
* );
61 static void I420_YVYU ( filter_t
*, picture_t
*, picture_t
* );
62 static void I420_UYVY ( filter_t
*, picture_t
*, picture_t
* );
63 static picture_t
*I420_YUY2_Filter ( filter_t
*, picture_t
* );
64 static picture_t
*I420_YVYU_Filter ( filter_t
*, picture_t
* );
65 static picture_t
*I420_UYVY_Filter ( filter_t
*, picture_t
* );
66 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
67 static void I420_IUYV ( filter_t
*, picture_t
*, picture_t
* );
68 static void I420_cyuv ( filter_t
*, picture_t
*, picture_t
* );
69 static picture_t
*I420_IUYV_Filter ( filter_t
*, picture_t
* );
70 static picture_t
*I420_cyuv_Filter ( filter_t
*, picture_t
* );
72 #if defined (MODULE_NAME_IS_i420_yuy2)
73 static void I420_Y211 ( filter_t
*, picture_t
*, picture_t
* );
74 static picture_t
*I420_Y211_Filter ( filter_t
*, picture_t
* );
77 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
78 /* Initialize MMX-specific constants */
79 static const uint64_t i_00ffw
= 0x00ff00ff00ff00ffULL
;
80 static const uint64_t i_80w
= 0x0000000080808080ULL
;
83 /*****************************************************************************
85 *****************************************************************************/
87 #if defined (MODULE_NAME_IS_i420_yuy2)
88 set_description( N_("Conversions from " SRC_FOURCC
" to " DEST_FOURCC
) )
89 set_capability( "video filter2", 80 )
90 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
91 set_description( N_("MMX conversions from " SRC_FOURCC
" to " DEST_FOURCC
) )
92 set_capability( "video filter2", 160 )
93 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
94 set_description( N_("SSE2 conversions from " SRC_FOURCC
" to " DEST_FOURCC
) )
95 set_capability( "video filter2", 250 )
96 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
98 _("AltiVec conversions from " SRC_FOURCC
" to " DEST_FOURCC
) );
99 set_capability( "video filter2", 250 )
101 set_callbacks( Activate
, NULL
)
104 /*****************************************************************************
105 * Activate: select the chroma conversion routine
106 *****************************************************************************
107 * Checks the picture dimensions, then sets pf_video_filter for the output chroma
108 *****************************************************************************/
109 static int Activate( vlc_object_t
*p_this
)
111 filter_t
*p_filter
= (filter_t
*)p_this
;
113 if( p_filter
->fmt_in
.video
.i_width
& 1
114 || p_filter
->fmt_in
.video
.i_height
& 1 )
119 if( p_filter
->fmt_in
.video
.i_width
!= p_filter
->fmt_out
.video
.i_width
120 || p_filter
->fmt_in
.video
.i_height
!= p_filter
->fmt_out
.video
.i_height
)
123 switch( p_filter
->fmt_in
.video
.i_chroma
)
127 switch( p_filter
->fmt_out
.video
.i_chroma
)
130 p_filter
->pf_video_filter
= I420_YUY2_Filter
;
134 p_filter
->pf_video_filter
= I420_YVYU_Filter
;
138 p_filter
->pf_video_filter
= I420_UYVY_Filter
;
140 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
141 case VLC_FOURCC('I','U','Y','V'):
142 p_filter
->pf_video_filter
= I420_IUYV_Filter
;
146 p_filter
->pf_video_filter
= I420_cyuv_Filter
;
150 #if defined (MODULE_NAME_IS_i420_yuy2)
152 p_filter
->pf_video_filter
= I420_Y211_Filter
;
169 static inline unsigned long long read_cycles(void)
171 unsigned long long v
;
172 __asm__
__volatile__("rdtsc" : "=A" (v
): );
178 /* Following functions are local */
180 VIDEO_FILTER_WRAPPER( I420_YUY2
)
181 VIDEO_FILTER_WRAPPER( I420_YVYU
)
182 VIDEO_FILTER_WRAPPER( I420_UYVY
)
183 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
184 VIDEO_FILTER_WRAPPER( I420_IUYV
)
185 VIDEO_FILTER_WRAPPER( I420_cyuv
)
187 #if defined (MODULE_NAME_IS_i420_yuy2)
188 VIDEO_FILTER_WRAPPER( I420_Y211
)
191 /*****************************************************************************
192 * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
193 *****************************************************************************/
194 static void I420_YUY2( filter_t
*p_filter
, picture_t
*p_source
,
197 uint8_t *p_line1
, *p_line2
= p_dest
->p
->p_pixels
;
198 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
199 uint8_t *p_u
= p_source
->U_PIXELS
;
200 uint8_t *p_v
= p_source
->V_PIXELS
;
204 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
205 #define VEC_NEXT_LINES( ) \
207 p_line2 += p_dest->p->i_pitch; \
209 p_y2 += p_source->p[Y_PLANE].i_pitch;
211 #define VEC_LOAD_UV( ) \
212 u_vec = vec_ld( 0, p_u ); p_u += 16; \
213 v_vec = vec_ld( 0, p_v ); p_v += 16;
215 #define VEC_MERGE( a ) \
216 uv_vec = a( u_vec, v_vec ); \
217 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
218 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
219 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
220 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
221 vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
222 vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
224 vector
unsigned char u_vec
;
225 vector
unsigned char v_vec
;
226 vector
unsigned char uv_vec
;
227 vector
unsigned char y_vec
;
229 if( !( ( p_filter
->fmt_in
.video
.i_width
% 32 ) |
230 ( p_filter
->fmt_in
.video
.i_height
% 2 ) ) )
232 /* Width is a multiple of 32, we take 2 lines at a time */
233 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
236 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
239 VEC_MERGE( vec_mergeh
);
240 VEC_MERGE( vec_mergel
);
244 else if( !( ( p_filter
->fmt_in
.video
.i_width
% 16 ) |
245 ( p_filter
->fmt_in
.video
.i_height
% 4 ) ) )
247 /* Width is only a multiple of 16, we take 4 lines at a time */
248 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 4 ; i_y
-- ; )
250 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
252 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
255 VEC_MERGE( vec_mergeh
);
256 VEC_MERGE( vec_mergel
);
259 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
261 VEC_MERGE( vec_mergeh
);
263 /* Line 3 and 4, pixels 0 to 16 */
265 VEC_MERGE( vec_mergel
);
267 /* Line 3 and 4, pixels 16 to ( width ) */
268 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
271 VEC_MERGE( vec_mergeh
);
272 VEC_MERGE( vec_mergel
);
278 /* Neither AltiVec case applies: use the C version */
279 #undef VEC_NEXT_LINES
284 const int i_source_margin
= p_source
->p
[0].i_pitch
285 - p_source
->p
[0].i_visible_pitch
;
286 const int i_source_margin_c
= p_source
->p
[1].i_pitch
287 - p_source
->p
[1].i_visible_pitch
;
288 const int i_dest_margin
= p_dest
->p
->i_pitch
289 - p_dest
->p
->i_visible_pitch
;
291 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
292 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
295 p_line2
+= p_dest
->p
->i_pitch
;
298 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
300 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
301 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8; i_x
-- ; )
309 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
311 MMX_CALL( MMX_YUV420_YUYV
);
314 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 8 ) / 2; i_x
-- ; )
319 p_y1
+= i_source_margin
;
320 p_y2
+= i_source_margin
;
321 p_u
+= i_source_margin_c
;
322 p_v
+= i_source_margin_c
;
323 p_line1
+= i_dest_margin
;
324 p_line2
+= i_dest_margin
;
327 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
328 /* re-enable FPU registers */
332 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
336 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
338 ** SSE2 128-bit fetch/store instructions are faster
339 ** if memory accesses are 16-byte aligned
342 if( 0 == (15 & (p_source
->p
[Y_PLANE
].i_pitch
|p_dest
->p
->i_pitch
|
343 ((intptr_t)p_line2
|(intptr_t)p_y2
))) )
345 /* use faster SSE2 aligned fetch and store */
346 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
349 p_line2
+= p_dest
->p
->i_pitch
;
352 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
354 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
356 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED
);
358 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
363 p_y1
+= i_source_margin
;
364 p_y2
+= i_source_margin
;
365 p_u
+= i_source_margin_c
;
366 p_v
+= i_source_margin_c
;
367 p_line1
+= i_dest_margin
;
368 p_line2
+= i_dest_margin
;
373 /* use slower SSE2 unaligned fetch and store */
374 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
377 p_line2
+= p_dest
->p
->i_pitch
;
380 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
382 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
384 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED
);
386 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
391 p_y1
+= i_source_margin
;
392 p_y2
+= i_source_margin
;
393 p_u
+= i_source_margin_c
;
394 p_v
+= i_source_margin_c
;
395 p_line1
+= i_dest_margin
;
396 p_line2
+= i_dest_margin
;
399 /* make sure all SSE2 stores are visible thereafter */
402 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
405 /*****************************************************************************
406 * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
407 *****************************************************************************/
408 static void I420_YVYU( filter_t
*p_filter
, picture_t
*p_source
,
411 uint8_t *p_line1
, *p_line2
= p_dest
->p
->p_pixels
;
412 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
413 uint8_t *p_u
= p_source
->U_PIXELS
;
414 uint8_t *p_v
= p_source
->V_PIXELS
;
418 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
419 #define VEC_NEXT_LINES( ) \
421 p_line2 += p_dest->p->i_pitch; \
423 p_y2 += p_source->p[Y_PLANE].i_pitch;
425 #define VEC_LOAD_UV( ) \
426 u_vec = vec_ld( 0, p_u ); p_u += 16; \
427 v_vec = vec_ld( 0, p_v ); p_v += 16;
429 #define VEC_MERGE( a ) \
430 vu_vec = a( v_vec, u_vec ); \
431 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
432 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
433 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
434 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
435 vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
436 vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
438 vector
unsigned char u_vec
;
439 vector
unsigned char v_vec
;
440 vector
unsigned char vu_vec
;
441 vector
unsigned char y_vec
;
443 if( !( ( p_filter
->fmt_in
.video
.i_width
% 32 ) |
444 ( p_filter
->fmt_in
.video
.i_height
% 2 ) ) )
446 /* Width is a multiple of 32, we take 2 lines at a time */
447 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
450 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
453 VEC_MERGE( vec_mergeh
);
454 VEC_MERGE( vec_mergel
);
458 else if( !( ( p_filter
->fmt_in
.video
.i_width
% 16 ) |
459 ( p_filter
->fmt_in
.video
.i_height
% 4 ) ) )
461 /* Width is only a multiple of 16, we take 4 lines at a time */
462 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 4 ; i_y
-- ; )
464 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
466 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
469 VEC_MERGE( vec_mergeh
);
470 VEC_MERGE( vec_mergel
);
473 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
475 VEC_MERGE( vec_mergeh
);
477 /* Line 3 and 4, pixels 0 to 16 */
479 VEC_MERGE( vec_mergel
);
481 /* Line 3 and 4, pixels 16 to ( width ) */
482 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
485 VEC_MERGE( vec_mergeh
);
486 VEC_MERGE( vec_mergel
);
492 /* Neither AltiVec case applies: use the C version */
493 #undef VEC_NEXT_LINES
498 const int i_source_margin
= p_source
->p
[0].i_pitch
499 - p_source
->p
[0].i_visible_pitch
;
500 const int i_source_margin_c
= p_source
->p
[1].i_pitch
501 - p_source
->p
[1].i_visible_pitch
;
502 const int i_dest_margin
= p_dest
->p
->i_pitch
503 - p_dest
->p
->i_visible_pitch
;
505 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
506 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
509 p_line2
+= p_dest
->p
->i_pitch
;
512 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
514 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
516 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
522 MMX_CALL( MMX_YUV420_YVYU
);
525 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 8 ) / 2; i_x
-- ; )
530 p_y1
+= i_source_margin
;
531 p_y2
+= i_source_margin
;
532 p_u
+= i_source_margin_c
;
533 p_v
+= i_source_margin_c
;
534 p_line1
+= i_dest_margin
;
535 p_line2
+= i_dest_margin
;
538 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
539 /* re-enable FPU registers */
543 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
547 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
549 ** SSE2 128-bit fetch/store instructions are faster
550 ** if memory accesses are 16-byte aligned
552 if( 0 == (15 & (p_source
->p
[Y_PLANE
].i_pitch
|p_dest
->p
->i_pitch
|
553 ((intptr_t)p_line2
|(intptr_t)p_y2
))) )
555 /* use faster SSE2 aligned fetch and store */
556 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
559 p_line2
+= p_dest
->p
->i_pitch
;
562 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
564 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
566 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED
);
568 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
573 p_y1
+= i_source_margin
;
574 p_y2
+= i_source_margin
;
575 p_u
+= i_source_margin_c
;
576 p_v
+= i_source_margin_c
;
577 p_line1
+= i_dest_margin
;
578 p_line2
+= i_dest_margin
;
583 /* use slower SSE2 unaligned fetch and store */
584 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
587 p_line2
+= p_dest
->p
->i_pitch
;
590 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
592 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
594 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED
);
596 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
601 p_y1
+= i_source_margin
;
602 p_y2
+= i_source_margin
;
603 p_u
+= i_source_margin_c
;
604 p_v
+= i_source_margin_c
;
605 p_line1
+= i_dest_margin
;
606 p_line2
+= i_dest_margin
;
609 /* make sure all SSE2 stores are visible thereafter */
611 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
614 /*****************************************************************************
615 * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
616 *****************************************************************************/
617 static void I420_UYVY( filter_t
*p_filter
, picture_t
*p_source
,
620 uint8_t *p_line1
, *p_line2
= p_dest
->p
->p_pixels
;
621 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
622 uint8_t *p_u
= p_source
->U_PIXELS
;
623 uint8_t *p_v
= p_source
->V_PIXELS
;
627 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
628 #define VEC_NEXT_LINES( ) \
630 p_line2 += p_dest->p->i_pitch; \
632 p_y2 += p_source->p[Y_PLANE].i_pitch;
634 #define VEC_LOAD_UV( ) \
635 u_vec = vec_ld( 0, p_u ); p_u += 16; \
636 v_vec = vec_ld( 0, p_v ); p_v += 16;
638 #define VEC_MERGE( a ) \
639 uv_vec = a( u_vec, v_vec ); \
640 y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
641 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
642 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
643 y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
644 vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
645 vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
647 vector
unsigned char u_vec
;
648 vector
unsigned char v_vec
;
649 vector
unsigned char uv_vec
;
650 vector
unsigned char y_vec
;
652 if( !( ( p_filter
->fmt_in
.video
.i_width
% 32 ) |
653 ( p_filter
->fmt_in
.video
.i_height
% 2 ) ) )
655 /* Width is a multiple of 32, we take 2 lines at a time */
656 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
659 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
662 VEC_MERGE( vec_mergeh
);
663 VEC_MERGE( vec_mergel
);
667 else if( !( ( p_filter
->fmt_in
.video
.i_width
% 16 ) |
668 ( p_filter
->fmt_in
.video
.i_height
% 4 ) ) )
670 /* Width is only a multiple of 16, we take 4 lines at a time */
671 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 4 ; i_y
-- ; )
673 /* Line 1 and 2, pixels 0 to ( width - 16 ) */
675 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
678 VEC_MERGE( vec_mergeh
);
679 VEC_MERGE( vec_mergel
);
682 /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
684 VEC_MERGE( vec_mergeh
);
686 /* Line 3 and 4, pixels 0 to 16 */
688 VEC_MERGE( vec_mergel
);
690 /* Line 3 and 4, pixels 16 to ( width ) */
691 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 32 ; i_x
-- ; )
694 VEC_MERGE( vec_mergeh
);
695 VEC_MERGE( vec_mergel
);
701 /* Neither AltiVec case applies: use the C version */
702 #undef VEC_NEXT_LINES
707 const int i_source_margin
= p_source
->p
[0].i_pitch
708 - p_source
->p
[0].i_visible_pitch
;
709 const int i_source_margin_c
= p_source
->p
[1].i_pitch
710 - p_source
->p
[1].i_visible_pitch
;
711 const int i_dest_margin
= p_dest
->p
->i_pitch
712 - p_dest
->p
->i_visible_pitch
;
714 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
715 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
718 p_line2
+= p_dest
->p
->i_pitch
;
721 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
723 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
725 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
731 MMX_CALL( MMX_YUV420_UYVY
);
734 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 8 ) / 2; i_x
--; )
739 p_y1
+= i_source_margin
;
740 p_y2
+= i_source_margin
;
741 p_u
+= i_source_margin_c
;
742 p_v
+= i_source_margin_c
;
743 p_line1
+= i_dest_margin
;
744 p_line2
+= i_dest_margin
;
747 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
748 /* re-enable FPU registers */
752 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
756 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
758 ** SSE2 128-bit fetch/store instructions are faster
759 ** if memory accesses are 16-byte aligned
761 if( 0 == (15 & (p_source
->p
[Y_PLANE
].i_pitch
|p_dest
->p
->i_pitch
|
762 ((intptr_t)p_line2
|(intptr_t)p_y2
))) )
764 /* use faster SSE2 aligned fetch and store */
765 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
768 p_line2
+= p_dest
->p
->i_pitch
;
771 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
773 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
775 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED
);
777 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
782 p_y1
+= i_source_margin
;
783 p_y2
+= i_source_margin
;
784 p_u
+= i_source_margin_c
;
785 p_v
+= i_source_margin_c
;
786 p_line1
+= i_dest_margin
;
787 p_line2
+= i_dest_margin
;
792 /* use slower SSE2 unaligned fetch and store */
793 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
796 p_line2
+= p_dest
->p
->i_pitch
;
799 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
801 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
803 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED
);
805 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
810 p_y1
+= i_source_margin
;
811 p_y2
+= i_source_margin
;
812 p_u
+= i_source_margin_c
;
813 p_v
+= i_source_margin_c
;
814 p_line1
+= i_dest_margin
;
815 p_line2
+= i_dest_margin
;
818 /* make sure all SSE2 stores are visible thereafter */
820 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
823 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
824 /*****************************************************************************
825 * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
826 *****************************************************************************/
827 static void I420_IUYV( filter_t
*p_filter
, picture_t
*p_source
,
830 VLC_UNUSED(p_source
); VLC_UNUSED(p_dest
);
832 msg_Err( p_filter
, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
835 /*****************************************************************************
836 * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
837 *****************************************************************************/
838 static void I420_cyuv( filter_t
*p_filter
, picture_t
*p_source
,
841 uint8_t *p_line1
= p_dest
->p
->p_pixels
+
842 p_dest
->p
->i_visible_lines
* p_dest
->p
->i_pitch
843 + p_dest
->p
->i_pitch
;
844 uint8_t *p_line2
= p_dest
->p
->p_pixels
+
845 p_dest
->p
->i_visible_lines
* p_dest
->p
->i_pitch
;
846 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
847 uint8_t *p_u
= p_source
->U_PIXELS
;
848 uint8_t *p_v
= p_source
->V_PIXELS
;
852 const int i_source_margin
= p_source
->p
[0].i_pitch
853 - p_source
->p
[0].i_visible_pitch
;
854 const int i_source_margin_c
= p_source
->p
[1].i_pitch
855 - p_source
->p
[1].i_visible_pitch
;
856 const int i_dest_margin
= p_dest
->p
->i_pitch
857 - p_dest
->p
->i_visible_pitch
;
859 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
860 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
862 p_line1
-= 3 * p_dest
->p
->i_pitch
;
863 p_line2
-= 3 * p_dest
->p
->i_pitch
;
866 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
868 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
870 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
876 MMX_CALL( MMX_YUV420_UYVY
);
879 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 8 ) / 2; i_x
-- ; )
884 p_y1
+= i_source_margin
;
885 p_y2
+= i_source_margin
;
886 p_u
+= i_source_margin_c
;
887 p_v
+= i_source_margin_c
;
888 p_line1
+= i_dest_margin
;
889 p_line2
+= i_dest_margin
;
892 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
893 /* re-enable FPU registers */
897 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
899 ** SSE2 128-bit fetch/store instructions are faster
900 ** if memory accesses are 16-byte aligned
902 if( 0 == (15 & (p_source
->p
[Y_PLANE
].i_pitch
|p_dest
->p
->i_pitch
|
903 ((intptr_t)p_line2
|(intptr_t)p_y2
))) )
905 /* use faster SSE2 aligned fetch and store */
906 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
909 p_line2
+= p_dest
->p
->i_pitch
;
912 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
914 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
916 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED
);
918 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
923 p_y1
+= i_source_margin
;
924 p_y2
+= i_source_margin
;
925 p_u
+= i_source_margin_c
;
926 p_v
+= i_source_margin_c
;
927 p_line1
+= i_dest_margin
;
928 p_line2
+= i_dest_margin
;
933 /* use slower SSE2 unaligned fetch and store */
934 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
937 p_line2
+= p_dest
->p
->i_pitch
;
940 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
942 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 16 ; i_x
-- ; )
944 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED
);
946 for( i_x
= ( p_filter
->fmt_in
.video
.i_width
% 16 ) / 2; i_x
-- ; )
951 p_y1
+= i_source_margin
;
952 p_y2
+= i_source_margin
;
953 p_u
+= i_source_margin_c
;
954 p_v
+= i_source_margin_c
;
955 p_line1
+= i_dest_margin
;
956 p_line2
+= i_dest_margin
;
959 /* make sure all SSE2 stores are visible thereafter */
961 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
963 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
965 /*****************************************************************************
966 * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
967 *****************************************************************************/
968 #if defined (MODULE_NAME_IS_i420_yuy2)
969 static void I420_Y211( filter_t
*p_filter
, picture_t
*p_source
,
972 uint8_t *p_line1
, *p_line2
= p_dest
->p
->p_pixels
;
973 uint8_t *p_y1
, *p_y2
= p_source
->Y_PIXELS
;
974 uint8_t *p_u
= p_source
->U_PIXELS
;
975 uint8_t *p_v
= p_source
->V_PIXELS
;
979 const int i_source_margin
= p_source
->p
[0].i_pitch
980 - p_source
->p
[0].i_visible_pitch
;
981 const int i_source_margin_c
= p_source
->p
[1].i_pitch
982 - p_source
->p
[1].i_visible_pitch
;
983 const int i_dest_margin
= p_dest
->p
->i_pitch
984 - p_dest
->p
->i_visible_pitch
;
986 for( i_y
= p_filter
->fmt_in
.video
.i_height
/ 2 ; i_y
-- ; )
989 p_line2
+= p_dest
->p
->i_pitch
;
992 p_y2
+= p_source
->p
[Y_PLANE
].i_pitch
;
994 for( i_x
= p_filter
->fmt_in
.video
.i_width
/ 8 ; i_x
-- ; )
1000 p_y1
+= i_source_margin
;
1001 p_y2
+= i_source_margin
;
1002 p_u
+= i_source_margin_c
;
1003 p_v
+= i_source_margin_c
;
1004 p_line1
+= i_dest_margin
;
1005 p_line2
+= i_dest_margin
;