modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_plugin.h>
  35 #include <vlc_filter.h>
  36
  37 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  38 #   include <altivec.h>
  39 #endif
  40
  41 #include "i420_yuy2.h"
  42
  43 #define SRC_FOURCC  "I420,IYUV,YV12"
  44
  45 #if defined (MODULE_NAME_IS_i420_yuy2)
  46 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
  47 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  48 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  49 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  50 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  51 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  52 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
  53 #endif
  54
  55 /*****************************************************************************
  56  * Local and extern prototypes.
  57  *****************************************************************************/
  58 static int  Activate ( vlc_object_t * );
  59
  60 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
  61 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
  62 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
  63 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
  64 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
  65 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
  66 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  67 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
  68 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
  69 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
  70 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
  71 #endif
  72 #if defined (MODULE_NAME_IS_i420_yuy2)
  73 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
  74 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
  75 #endif
  76
  77 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
  78 /* Initialize MMX-specific constants */
  79 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL;
  80 static const uint64_t i_80w   = 0x0000000080808080ULL;
  81 #endif
  82
  83 /*****************************************************************************
  84  * Module descriptor.
  85  *****************************************************************************/
  86 vlc_module_begin ()
  87 #if defined (MODULE_NAME_IS_i420_yuy2)
  88     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  89     set_capability( "video filter2", 80 )
  90 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  91     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  92     set_capability( "video filter2", 160 )
  93 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  94     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  95     set_capability( "video filter2", 250 )
  96 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  97     set_description(
  98             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
  99     set_capability( "video filter2", 250 )
 100 #endif
 101     set_callbacks( Activate, NULL )
 102 vlc_module_end ()
 103
 104 /*****************************************************************************
 105  * Activate: allocate a chroma function
 106  *****************************************************************************
 107  * This function allocates and initializes a chroma function
 108  *****************************************************************************/
 109 static int Activate( vlc_object_t *p_this )
 110 {
 111     filter_t *p_filter = (filter_t *)p_this;
 112
 113     if( p_filter->fmt_in.video.i_width & 1
 114      || p_filter->fmt_in.video.i_height & 1 )
 115     {
 116         return -1;
 117     }
 118
 119     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
 120      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
 121         return -1;
 122
 123     switch( p_filter->fmt_in.video.i_chroma )
 124     {
 125         case VLC_CODEC_YV12:
 126         case VLC_CODEC_I420:
 127             switch( p_filter->fmt_out.video.i_chroma )
 128             {
 129                 case VLC_CODEC_YUYV:
 130                     p_filter->pf_video_filter = I420_YUY2_Filter;
 131                     break;
 132
 133                 case VLC_CODEC_YVYU:
 134                     p_filter->pf_video_filter = I420_YVYU_Filter;
 135                     break;
 136
 137                 case VLC_CODEC_UYVY:
 138                     p_filter->pf_video_filter = I420_UYVY_Filter;
 139                     break;
 140 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 141                 case VLC_FOURCC('I','U','Y','V'):
 142                     p_filter->pf_video_filter = I420_IUYV_Filter;
 143                     break;
 144
 145                 case VLC_CODEC_CYUV:
 146                     p_filter->pf_video_filter = I420_cyuv_Filter;
 147                     break;
 148 #endif
 149
 150 #if defined (MODULE_NAME_IS_i420_yuy2)
 151                 case VLC_CODEC_Y211:
 152                     p_filter->pf_video_filter = I420_Y211_Filter;
 153                     break;
 154 #endif
 155
 156                 default:
 157                     return -1;
 158             }
 159             break;
 160
 161         default:
 162             return -1;
 163     }
 164
 165     return 0;
 166 }
 167
 168 #if 0
 169 static inline unsigned long long read_cycles(void)
 170 {
 171     unsigned long long v;
 172     __asm__ __volatile__("rdtsc" : "=A" (v): );
 173
 174     return v;
 175 }
 176 #endif
 177
 178 /* Following functions are local */
 179
 180 VIDEO_FILTER_WRAPPER( I420_YUY2 )
 181 VIDEO_FILTER_WRAPPER( I420_YVYU )
 182 VIDEO_FILTER_WRAPPER( I420_UYVY )
 183 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 184 VIDEO_FILTER_WRAPPER( I420_IUYV )
 185 VIDEO_FILTER_WRAPPER( I420_cyuv )
 186 #endif
 187 #if defined (MODULE_NAME_IS_i420_yuy2)
 188 VIDEO_FILTER_WRAPPER( I420_Y211 )
 189 #endif
 190
 191 /*****************************************************************************
 192  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
 193  *****************************************************************************/
 194 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
 195                                            picture_t *p_dest )
 196 {
 197     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 198     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 199     uint8_t *p_u = p_source->U_PIXELS;
 200     uint8_t *p_v = p_source->V_PIXELS;
 201
 202     int i_x, i_y;
 203
 204 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 205 #define VEC_NEXT_LINES( ) \
 206     p_line1  = p_line2; \
 207     p_line2 += p_dest->p->i_pitch; \
 208     p_y1     = p_y2; \
 209     p_y2    += p_source->p[Y_PLANE].i_pitch;
 210
 211 #define VEC_LOAD_UV( ) \
 212     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 213     v_vec = vec_ld( 0, p_v ); p_v += 16;
 214
 215 #define VEC_MERGE( a ) \
 216     uv_vec = a( u_vec, v_vec ); \
 217     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 218     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 219     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 220     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 221     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
 222     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
 223
 224     vector unsigned char u_vec;
 225     vector unsigned char v_vec;
 226     vector unsigned char uv_vec;
 227     vector unsigned char y_vec;
 228
 229     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 230            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 231     {
 232         /* Width is a multiple of 32, we take 2 lines at a time */
 233         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 234         {
 235             VEC_NEXT_LINES( );
 236             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 237             {
 238                 VEC_LOAD_UV( );
 239                 VEC_MERGE( vec_mergeh );
 240                 VEC_MERGE( vec_mergel );
 241             }
 242         }
 243     }
 244 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
 245 #if 0
 246     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 247                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 248     {
 249         /* Width is only a multiple of 16, we take 4 lines at a time */
 250         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 251         {
 252             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 253             VEC_NEXT_LINES( );
 254             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 255             {
 256                 VEC_LOAD_UV( );
 257                 VEC_MERGE( vec_mergeh );
 258                 VEC_MERGE( vec_mergel );
 259             }
 260
 261             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 262             VEC_LOAD_UV( );
 263             VEC_MERGE( vec_mergeh );
 264
 265             /* Line 3 and 4, pixels 0 to 16 */
 266             VEC_NEXT_LINES( );
 267             VEC_MERGE( vec_mergel );
 268
 269             /* Line 3 and 4, pixels 16 to ( width ) */
 270             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 271             {
 272                 VEC_LOAD_UV( );
 273                 VEC_MERGE( vec_mergeh );
 274                 VEC_MERGE( vec_mergel );
 275             }
 276         }
 277     }
 278 #endif
 279     else
 280     {
 281         /* Crap, use the C version */
 282 #undef VEC_NEXT_LINES
 283 #undef VEC_LOAD_UV
 284 #undef VEC_MERGE
 285 #endif
 286
 287     const int i_source_margin = p_source->p[0].i_pitch
 288                                  - p_source->p[0].i_visible_pitch;
 289     const int i_source_margin_c = p_source->p[1].i_pitch
 290                                  - p_source->p[1].i_visible_pitch;
 291     const int i_dest_margin = p_dest->p->i_pitch
 292                                - p_dest->p->i_visible_pitch;
 293
 294 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 295     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 296     {
 297         p_line1 = p_line2;
 298         p_line2 += p_dest->p->i_pitch;
 299
 300         p_y1 = p_y2;
 301         p_y2 += p_source->p[Y_PLANE].i_pitch;
 302
 303 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 304         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
 305         {
 306             C_YUV420_YUYV( );
 307             C_YUV420_YUYV( );
 308             C_YUV420_YUYV( );
 309             C_YUV420_YUYV( );
 310         }
 311 #else
 312         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 313         {
 314             MMX_CALL( MMX_YUV420_YUYV );
 315         }
 316 #endif
 317         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 318         {
 319             C_YUV420_YUYV( );
 320         }
 321
 322         p_y1 += i_source_margin;
 323         p_y2 += i_source_margin;
 324         p_u += i_source_margin_c;
 325         p_v += i_source_margin_c;
 326         p_line1 += i_dest_margin;
 327         p_line2 += i_dest_margin;
 328     }
 329
 330 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 331     /* re-enable FPU registers */
 332     MMX_END;
 333 #endif
 334
 335 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 336     }
 337 #endif
 338
 339 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 340     /*
 341     ** SSE2 128 bits fetch/store instructions are faster
 342     ** if memory access is 16 bytes aligned
 343     */
 344
 345     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 346         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 347     {
 348         /* use faster SSE2 aligned fetch and store */
 349         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 350         {
 351             p_line1 = p_line2;
 352             p_line2 += p_dest->p->i_pitch;
 353
 354             p_y1 = p_y2;
 355             p_y2 += p_source->p[Y_PLANE].i_pitch;
 356
 357             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 358             {
 359                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 360             }
 361             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 362             {
 363                 C_YUV420_YUYV( );
 364             }
 365
 366             p_y1 += i_source_margin;
 367             p_y2 += i_source_margin;
 368             p_u += i_source_margin_c;
 369             p_v += i_source_margin_c;
 370             p_line1 += i_dest_margin;
 371             p_line2 += i_dest_margin;
 372         }
 373     }
 374     else
 375     {
 376         /* use slower SSE2 unaligned fetch and store */
 377         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 378         {
 379             p_line1 = p_line2;
 380             p_line2 += p_dest->p->i_pitch;
 381
 382             p_y1 = p_y2;
 383             p_y2 += p_source->p[Y_PLANE].i_pitch;
 384
 385             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 386             {
 387                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 388             }
 389             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 390             {
 391                 C_YUV420_YUYV( );
 392             }
 393
 394             p_y1 += i_source_margin;
 395             p_y2 += i_source_margin;
 396             p_u += i_source_margin_c;
 397             p_v += i_source_margin_c;
 398             p_line1 += i_dest_margin;
 399             p_line2 += i_dest_margin;
 400         }
 401     }
 402     /* make sure all SSE2 stores are visible thereafter */
 403     SSE2_END;
 404
 405 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 406 }
 407
 408 /*****************************************************************************
 409  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
 410  *****************************************************************************/
 411 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
 412                                            picture_t *p_dest )
 413 {
 414     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 415     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 416     uint8_t *p_u = p_source->U_PIXELS;
 417     uint8_t *p_v = p_source->V_PIXELS;
 418
 419     int i_x, i_y;
 420
 421 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 422 #define VEC_NEXT_LINES( ) \
 423     p_line1  = p_line2; \
 424     p_line2 += p_dest->p->i_pitch; \
 425     p_y1     = p_y2; \
 426     p_y2    += p_source->p[Y_PLANE].i_pitch;
 427
 428 #define VEC_LOAD_UV( ) \
 429     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 430     v_vec = vec_ld( 0, p_v ); p_v += 16;
 431
 432 #define VEC_MERGE( a ) \
 433     vu_vec = a( v_vec, u_vec ); \
 434     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 435     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 436     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 437     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 438     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
 439     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
 440
 441     vector unsigned char u_vec;
 442     vector unsigned char v_vec;
 443     vector unsigned char vu_vec;
 444     vector unsigned char y_vec;
 445
 446     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 447            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 448     {
 449         /* Width is a multiple of 32, we take 2 lines at a time */
 450         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 451         {
 452             VEC_NEXT_LINES( );
 453             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 454             {
 455                 VEC_LOAD_UV( );
 456                 VEC_MERGE( vec_mergeh );
 457                 VEC_MERGE( vec_mergel );
 458             }
 459         }
 460     }
 461     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 462                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 463     {
 464         /* Width is only a multiple of 16, we take 4 lines at a time */
 465         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 466         {
 467             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 468             VEC_NEXT_LINES( );
 469             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 470             {
 471                 VEC_LOAD_UV( );
 472                 VEC_MERGE( vec_mergeh );
 473                 VEC_MERGE( vec_mergel );
 474             }
 475
 476             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 477             VEC_LOAD_UV( );
 478             VEC_MERGE( vec_mergeh );
 479
 480             /* Line 3 and 4, pixels 0 to 16 */
 481             VEC_NEXT_LINES( );
 482             VEC_MERGE( vec_mergel );
 483
 484             /* Line 3 and 4, pixels 16 to ( width ) */
 485             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 486             {
 487                 VEC_LOAD_UV( );
 488                 VEC_MERGE( vec_mergeh );
 489                 VEC_MERGE( vec_mergel );
 490             }
 491         }
 492     }
 493     else
 494     {
 495         /* Crap, use the C version */
 496 #undef VEC_NEXT_LINES
 497 #undef VEC_LOAD_UV
 498 #undef VEC_MERGE
 499 #endif
 500
 501     const int i_source_margin = p_source->p[0].i_pitch
 502                                  - p_source->p[0].i_visible_pitch;
 503     const int i_source_margin_c = p_source->p[1].i_pitch
 504                                  - p_source->p[1].i_visible_pitch;
 505     const int i_dest_margin = p_dest->p->i_pitch
 506                                - p_dest->p->i_visible_pitch;
 507
 508 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 509     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 510     {
 511         p_line1 = p_line2;
 512         p_line2 += p_dest->p->i_pitch;
 513
 514         p_y1 = p_y2;
 515         p_y2 += p_source->p[Y_PLANE].i_pitch;
 516
 517         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 518         {
 519 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 520             C_YUV420_YVYU( );
 521             C_YUV420_YVYU( );
 522             C_YUV420_YVYU( );
 523             C_YUV420_YVYU( );
 524 #else
 525             MMX_CALL( MMX_YUV420_YVYU );
 526 #endif
 527         }
 528         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 529         {
 530             C_YUV420_YVYU( );
 531         }
 532
 533         p_y1 += i_source_margin;
 534         p_y2 += i_source_margin;
 535         p_u += i_source_margin_c;
 536         p_v += i_source_margin_c;
 537         p_line1 += i_dest_margin;
 538         p_line2 += i_dest_margin;
 539     }
 540
 541 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 542     /* re-enable FPU registers */
 543     MMX_END;
 544 #endif
 545
 546 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 547     }
 548 #endif
 549
 550 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 551     /*
 552     ** SSE2 128 bits fetch/store instructions are faster
 553     ** if memory access is 16 bytes aligned
 554     */
 555     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 556         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 557     {
 558         /* use faster SSE2 aligned fetch and store */
 559         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 560         {
 561             p_line1 = p_line2;
 562             p_line2 += p_dest->p->i_pitch;
 563
 564             p_y1 = p_y2;
 565             p_y2 += p_source->p[Y_PLANE].i_pitch;
 566
 567             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 568             {
 569                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
 570             }
 571             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 572             {
 573                 C_YUV420_YVYU( );
 574             }
 575
 576             p_y1 += i_source_margin;
 577             p_y2 += i_source_margin;
 578             p_u += i_source_margin_c;
 579             p_v += i_source_margin_c;
 580             p_line1 += i_dest_margin;
 581             p_line2 += i_dest_margin;
 582         }
 583     }
 584     else
 585     {
 586         /* use slower SSE2 unaligned fetch and store */
 587         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 588         {
 589             p_line1 = p_line2;
 590             p_line2 += p_dest->p->i_pitch;
 591
 592             p_y1 = p_y2;
 593             p_y2 += p_source->p[Y_PLANE].i_pitch;
 594
 595             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 596             {
 597                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
 598             }
 599             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 600             {
 601                 C_YUV420_YVYU( );
 602             }
 603
 604             p_y1 += i_source_margin;
 605             p_y2 += i_source_margin;
 606             p_u += i_source_margin_c;
 607             p_v += i_source_margin_c;
 608             p_line1 += i_dest_margin;
 609             p_line2 += i_dest_margin;
 610         }
 611     }
 612     /* make sure all SSE2 stores are visible thereafter */
 613     SSE2_END;
 614 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 615 }
 616
 617 /*****************************************************************************
 618  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
 619  *****************************************************************************/
 620 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
 621                                            picture_t *p_dest )
 622 {
 623     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 624     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 625     uint8_t *p_u = p_source->U_PIXELS;
 626     uint8_t *p_v = p_source->V_PIXELS;
 627
 628     int i_x, i_y;
 629
 630 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 631 #define VEC_NEXT_LINES( ) \
 632     p_line1  = p_line2; \
 633     p_line2 += p_dest->p->i_pitch; \
 634     p_y1     = p_y2; \
 635     p_y2    += p_source->p[Y_PLANE].i_pitch;
 636
 637 #define VEC_LOAD_UV( ) \
 638     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 639     v_vec = vec_ld( 0, p_v ); p_v += 16;
 640
 641 #define VEC_MERGE( a ) \
 642     uv_vec = a( u_vec, v_vec ); \
 643     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 644     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 645     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 646     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 647     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
 648     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
 649
 650     vector unsigned char u_vec;
 651     vector unsigned char v_vec;
 652     vector unsigned char uv_vec;
 653     vector unsigned char y_vec;
 654
 655     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 656            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 657     {
 658         /* Width is a multiple of 32, we take 2 lines at a time */
 659         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 660         {
 661             VEC_NEXT_LINES( );
 662             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 663             {
 664                 VEC_LOAD_UV( );
 665                 VEC_MERGE( vec_mergeh );
 666                 VEC_MERGE( vec_mergel );
 667             }
 668         }
 669     }
 670     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 671                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 672     {
 673         /* Width is only a multiple of 16, we take 4 lines at a time */
 674         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 675         {
 676             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 677             VEC_NEXT_LINES( );
 678             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 679             {
 680                 VEC_LOAD_UV( );
 681                 VEC_MERGE( vec_mergeh );
 682                 VEC_MERGE( vec_mergel );
 683             }
 684
 685             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 686             VEC_LOAD_UV( );
 687             VEC_MERGE( vec_mergeh );
 688
 689             /* Line 3 and 4, pixels 0 to 16 */
 690             VEC_NEXT_LINES( );
 691             VEC_MERGE( vec_mergel );
 692
 693             /* Line 3 and 4, pixels 16 to ( width ) */
 694             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 695             {
 696                 VEC_LOAD_UV( );
 697                 VEC_MERGE( vec_mergeh );
 698                 VEC_MERGE( vec_mergel );
 699             }
 700         }
 701     }
 702     else
 703     {
 704         /* Crap, use the C version */
 705 #undef VEC_NEXT_LINES
 706 #undef VEC_LOAD_UV
 707 #undef VEC_MERGE
 708 #endif
 709
 710     const int i_source_margin = p_source->p[0].i_pitch
 711                                  - p_source->p[0].i_visible_pitch;
 712     const int i_source_margin_c = p_source->p[1].i_pitch
 713                                  - p_source->p[1].i_visible_pitch;
 714     const int i_dest_margin = p_dest->p->i_pitch
 715                                - p_dest->p->i_visible_pitch;
 716
 717 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 718     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 719     {
 720         p_line1 = p_line2;
 721         p_line2 += p_dest->p->i_pitch;
 722
 723         p_y1 = p_y2;
 724         p_y2 += p_source->p[Y_PLANE].i_pitch;
 725
 726         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 727         {
 728 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 729             C_YUV420_UYVY( );
 730             C_YUV420_UYVY( );
 731             C_YUV420_UYVY( );
 732             C_YUV420_UYVY( );
 733 #else
 734             MMX_CALL( MMX_YUV420_UYVY );
 735 #endif
 736         }
 737         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
 738         {
 739             C_YUV420_UYVY( );
 740         }
 741
 742         p_y1 += i_source_margin;
 743         p_y2 += i_source_margin;
 744         p_u += i_source_margin_c;
 745         p_v += i_source_margin_c;
 746         p_line1 += i_dest_margin;
 747         p_line2 += i_dest_margin;
 748     }
 749
 750 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 751     /* re-enable FPU registers */
 752     MMX_END;
 753 #endif
 754
 755 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 756     }
 757 #endif
 758
 759 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 760     /*
 761     ** SSE2 128 bits fetch/store instructions are faster
 762     ** if memory access is 16 bytes aligned
 763     */
 764     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 765         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 766     {
 767         /* use faster SSE2 aligned fetch and store */
 768         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 769         {
 770             p_line1 = p_line2;
 771             p_line2 += p_dest->p->i_pitch;
 772
 773             p_y1 = p_y2;
 774             p_y2 += p_source->p[Y_PLANE].i_pitch;
 775
 776             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 777             {
 778                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 779             }
 780             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 781             {
 782                 C_YUV420_UYVY( );
 783             }
 784
 785             p_y1 += i_source_margin;
 786             p_y2 += i_source_margin;
 787             p_u += i_source_margin_c;
 788             p_v += i_source_margin_c;
 789             p_line1 += i_dest_margin;
 790             p_line2 += i_dest_margin;
 791         }
 792     }
 793     else
 794     {
 795         /* use slower SSE2 unaligned fetch and store */
 796         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 797         {
 798             p_line1 = p_line2;
 799             p_line2 += p_dest->p->i_pitch;
 800
 801             p_y1 = p_y2;
 802             p_y2 += p_source->p[Y_PLANE].i_pitch;
 803
 804             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 805             {
 806                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 807             }
 808             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 809             {
 810                 C_YUV420_UYVY( );
 811             }
 812
 813             p_y1 += i_source_margin;
 814             p_y2 += i_source_margin;
 815             p_u += i_source_margin_c;
 816             p_v += i_source_margin_c;
 817             p_line1 += i_dest_margin;
 818             p_line2 += i_dest_margin;
 819         }
 820     }
 821     /* make sure all SSE2 stores are visible thereafter */
 822     SSE2_END;
 823 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 824 }
 825
 826 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 827 /*****************************************************************************
 828  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 829  *****************************************************************************/
 830 static void I420_IUYV( filter_t *p_filter, picture_t *p_source,
 831                                            picture_t *p_dest )
 832 {
 833     VLC_UNUSED(p_source); VLC_UNUSED(p_dest);
 834     /* FIXME: TODO ! */
 835     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 836 }
 837
 838 /*****************************************************************************
 839  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 840  *****************************************************************************/
 841 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
 842                                            picture_t *p_dest )
 843 {
 844     uint8_t *p_line1 = p_dest->p->p_pixels +
 845                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 846                        + p_dest->p->i_pitch;
 847     uint8_t *p_line2 = p_dest->p->p_pixels +
 848                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 849     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 850     uint8_t *p_u = p_source->U_PIXELS;
 851     uint8_t *p_v = p_source->V_PIXELS;
 852
 853     int i_x, i_y;
 854
 855     const int i_source_margin = p_source->p[0].i_pitch
 856                                  - p_source->p[0].i_visible_pitch;
 857     const int i_source_margin_c = p_source->p[1].i_pitch
 858                                  - p_source->p[1].i_visible_pitch;
 859     const int i_dest_margin = p_dest->p->i_pitch
 860                                - p_dest->p->i_visible_pitch;
 861
 862 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 863     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 864     {
 865         p_line1 -= 3 * p_dest->p->i_pitch;
 866         p_line2 -= 3 * p_dest->p->i_pitch;
 867
 868         p_y1 = p_y2;
 869         p_y2 += p_source->p[Y_PLANE].i_pitch;
 870
 871         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 872         {
 873 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 874             C_YUV420_UYVY( );
 875             C_YUV420_UYVY( );
 876             C_YUV420_UYVY( );
 877             C_YUV420_UYVY( );
 878 #else
 879             MMX_CALL( MMX_YUV420_UYVY );
 880 #endif
 881         }
 882         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 883         {
 884             C_YUV420_UYVY( );
 885         }
 886
 887         p_y1 += i_source_margin;
 888         p_y2 += i_source_margin;
 889         p_u += i_source_margin_c;
 890         p_v += i_source_margin_c;
 891         p_line1 += i_dest_margin;
 892         p_line2 += i_dest_margin;
 893     }
 894
 895 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 896     /* re-enable FPU registers */
 897     MMX_END;
 898 #endif
 899
 900 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 901     /*
 902     ** SSE2 128 bits fetch/store instructions are faster
 903     ** if memory access is 16 bytes aligned
 904     */
 905     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 906         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 907     {
 908         /* use faster SSE2 aligned fetch and store */
 909         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 910         {
 911             p_line1 = p_line2;
 912             p_line2 += p_dest->p->i_pitch;
 913
 914             p_y1 = p_y2;
 915             p_y2 += p_source->p[Y_PLANE].i_pitch;
 916
 917             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 918             {
 919                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 920             }
 921             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 922             {
 923                 C_YUV420_UYVY( );
 924             }
 925
 926             p_y1 += i_source_margin;
 927             p_y2 += i_source_margin;
 928             p_u += i_source_margin_c;
 929             p_v += i_source_margin_c;
 930             p_line1 += i_dest_margin;
 931             p_line2 += i_dest_margin;
 932         }
 933     }
 934     else
 935     {
 936         /* use slower SSE2 unaligned fetch and store */
 937         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 938         {
 939             p_line1 = p_line2;
 940             p_line2 += p_dest->p->i_pitch;
 941
 942             p_y1 = p_y2;
 943             p_y2 += p_source->p[Y_PLANE].i_pitch;
 944
 945             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 946             {
 947                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 948             }
 949             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 950             {
 951                 C_YUV420_UYVY( );
 952             }
 953
 954             p_y1 += i_source_margin;
 955             p_y2 += i_source_margin;
 956             p_u += i_source_margin_c;
 957             p_v += i_source_margin_c;
 958             p_line1 += i_dest_margin;
 959             p_line2 += i_dest_margin;
 960         }
 961     }
 962     /* make sure all SSE2 stores are visible thereafter */
 963     SSE2_END;
 964 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 965 }
 966 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 967
 968 /*****************************************************************************
 969  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 970  *****************************************************************************/
 971 #if defined (MODULE_NAME_IS_i420_yuy2)
 972 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
 973                                            picture_t *p_dest )
 974 {
 975     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 976     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 977     uint8_t *p_u = p_source->U_PIXELS;
 978     uint8_t *p_v = p_source->V_PIXELS;
 979
 980     int i_x, i_y;
 981
 982     const int i_source_margin = p_source->p[0].i_pitch
 983                                  - p_source->p[0].i_visible_pitch;
 984     const int i_source_margin_c = p_source->p[1].i_pitch
 985                                  - p_source->p[1].i_visible_pitch;
 986     const int i_dest_margin = p_dest->p->i_pitch
 987                                - p_dest->p->i_visible_pitch;
 988
 989     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 990     {
 991         p_line1 = p_line2;
 992         p_line2 += p_dest->p->i_pitch;
 993
 994         p_y1 = p_y2;
 995         p_y2 += p_source->p[Y_PLANE].i_pitch;
 996
 997         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 998         {
 999             C_YUV420_Y211( );
1000             C_YUV420_Y211( );
1001         }
1002
1003         p_y1 += i_source_margin;
1004         p_y2 += i_source_margin;
1005         p_u += i_source_margin_c;
1006         p_v += i_source_margin_c;
1007         p_line1 += i_dest_margin;
1008         p_line2 += i_dest_margin;
1009     }
1010 }
1011 #endif