modules/video_chroma/i420_yuy2.c

   1 /*****************************************************************************
   2  * i420_yuy2.c : YUV to YUV conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000, 2001 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damien@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_plugin.h>
  35 #include <vlc_filter.h>
  36 #include <vlc_cpu.h>
  37
  38 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) && defined(HAVE_ALTIVEC_H)
  39 #   include <altivec.h>
  40 #endif
  41
  42 #include "i420_yuy2.h"
  43
  44 #define SRC_FOURCC  "I420,IYUV,YV12"  /* input chromas named in the description text */
  45
     /* Output chromas named in the description text, one list per build variant
      * (selected by the MODULE_NAME_IS_* macro).  Only the plain C list contains
      * Y211, and the AltiVec list contains neither IUYV nor cyuv, which matches
      * the #if guards around the corresponding prototypes further down. */
  46 #if defined (MODULE_NAME_IS_i420_yuy2)
  47 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv,Y211"
  48 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  49 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  50 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  51 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv"
  52 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
  53 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
  54 #endif
  55
  56 /*****************************************************************************
  57  * Local prototypes (every function declared here is static).
  58  *****************************************************************************/
  59 static int  Activate ( vlc_object_t * ); /* passed to set_callbacks() below */
  60
     /* Converters take ( filter, source picture, destination picture ).
      * The *_Filter variants are what Activate() stores in pf_video_filter;
      * they are presumably defined by the VIDEO_FILTER_WRAPPER() lines further
      * down -- TODO confirm against vlc_filter.h. */
  61 static void I420_YUY2           ( filter_t *, picture_t *, picture_t * );
  62 static void I420_YVYU           ( filter_t *, picture_t *, picture_t * );
  63 static void I420_UYVY           ( filter_t *, picture_t *, picture_t * );
  64 static picture_t *I420_YUY2_Filter    ( filter_t *, picture_t * );
  65 static picture_t *I420_YVYU_Filter    ( filter_t *, picture_t * );
  66 static picture_t *I420_UYVY_Filter    ( filter_t *, picture_t * );
     /* Not built in the AltiVec variant */
  67 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
  68 static void I420_IUYV           ( filter_t *, picture_t *, picture_t * );
  69 static void I420_cyuv           ( filter_t *, picture_t *, picture_t * );
  70 static picture_t *I420_IUYV_Filter    ( filter_t *, picture_t * );
  71 static picture_t *I420_cyuv_Filter    ( filter_t *, picture_t * );
  72 #endif
     /* Only built in the plain C variant */
  73 #if defined (MODULE_NAME_IS_i420_yuy2)
  74 static void I420_Y211           ( filter_t *, picture_t *, picture_t * );
  75 static picture_t *I420_Y211_Filter    ( filter_t *, picture_t * );
  76 #endif
  77
  78 #ifdef MODULE_NAME_IS_i420_yuy2_mmx
  79 /* 64-bit constants for the MMX build.  They are not referenced by name in
      * the converters of this file; presumably the MMX macros pulled in from
      * i420_yuy2.h use them -- TODO confirm. */
  80 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL; /* 0x00ff in each 16-bit word */
  81 static const uint64_t i_80w   = 0x0000000080808080ULL; /* 0x80 in the four low bytes only */
  82 #endif
  83
  84 /*****************************************************************************
  85  * Module descriptor.
  86  *****************************************************************************/
  87 vlc_module_begin ()
     /* At most one branch below is compiled, chosen by the MODULE_NAME_IS_*
      * macro of the build variant.  Each branch sets the module description,
      * registers the "video filter2" capability with a per-variant score, and
      * defines CPU_CAPABILITY, the CPU feature mask that Activate() tests before
      * accepting the filter (0 disables that test).
      * Every description is wrapped in N_() and no statement carries a trailing
      * semicolon, so all variants expand the descriptor macros the same way. */
  88 #if defined (MODULE_NAME_IS_i420_yuy2)
  89     set_description( N_("Conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  90     set_capability( "video filter2", 80 )
  91 # define CPU_CAPABILITY 0
  92 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx)
  93     set_description( N_("MMX conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  94     set_capability( "video filter2", 160 )
  95 # define CPU_CAPABILITY CPU_CAPABILITY_MMX
  96 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2)
  97     set_description( N_("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
  98     set_capability( "video filter2", 250 )
  99 # define CPU_CAPABILITY CPU_CAPABILITY_SSE2
 100 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
 101     set_description(
 102             N_("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) )
 103     set_capability( "video filter2", 250 )
 104 # define CPU_CAPABILITY CPU_CAPABILITY_ALTIVEC
 105 #endif
     /* Activate() is the first callback; the second one is left NULL. */
 106     set_callbacks( Activate, NULL )
 107 vlc_module_end ()
 108
 109 /*****************************************************************************
 110  * Activate: select the conversion routine for a filter instance
 111  *****************************************************************************
 112  * Registered through set_callbacks() in the module descriptor.  p_this is
      * cast to the filter_t being set up.  The filter is accepted only when:
      *   - the CPU reports CPU_CAPABILITY (tested only if that mask is non-zero),
      *   - the input width and height are both even,
      *   - input and output dimensions are identical,
      *   - the input chroma is YV12 or I420 and the output chroma is one of the
      *     packed formats compiled into this build variant.
      * On success pf_video_filter points at the matching *_Filter wrapper and
      * VLC_SUCCESS is returned; every refusal returns VLC_EGENERIC.  Nothing is
      * allocated here.
 113  *****************************************************************************/
 114 static int Activate( vlc_object_t *p_this )
 115 {
 116     filter_t *p_filter = (filter_t *)p_this;
 117
 118 #if CPU_CAPABILITY
 119     if( !(vlc_CPU() & CPU_CAPABILITY) )
 120         return VLC_EGENERIC;
 121 #endif
         /* Odd dimensions are refused */
 122     if( p_filter->fmt_in.video.i_width & 1
 123      || p_filter->fmt_in.video.i_height & 1 )
 124     {
 125         return VLC_EGENERIC;
 126     }
 127
         /* This module converts chroma only: no rescaling */
 128     if( p_filter->fmt_in.video.i_width != p_filter->fmt_out.video.i_width
 129      || p_filter->fmt_in.video.i_height != p_filter->fmt_out.video.i_height )
 130         return VLC_EGENERIC;
 131
 132     switch( p_filter->fmt_in.video.i_chroma )
 133     {
 134         case VLC_CODEC_YV12:
 135         case VLC_CODEC_I420:
 136             switch( p_filter->fmt_out.video.i_chroma )
 137             {
 138                 case VLC_CODEC_YUYV:
 139                     p_filter->pf_video_filter = I420_YUY2_Filter;
 140                     break;
 141
 142                 case VLC_CODEC_YVYU:
 143                     p_filter->pf_video_filter = I420_YVYU_Filter;
 144                     break;
 145
 146                 case VLC_CODEC_UYVY:
 147                     p_filter->pf_video_filter = I420_UYVY_Filter;
 148                     break;
 149 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
                     /* NOTE(review): I420_IUYV() only logs an error and
                      * converts nothing, yet it is still selectable here. */
 150                 case VLC_FOURCC('I','U','Y','V'):
 151                     p_filter->pf_video_filter = I420_IUYV_Filter;
 152                     break;
 153
 154                 case VLC_CODEC_CYUV:
 155                     p_filter->pf_video_filter = I420_cyuv_Filter;
 156                     break;
 157 #endif
 158
 159 #if defined (MODULE_NAME_IS_i420_yuy2)
 160                 case VLC_CODEC_Y211:
 161                     p_filter->pf_video_filter = I420_Y211_Filter;
 162                     break;
 163 #endif
 164
 165                 default:
 166                     return VLC_EGENERIC;
 167             }
 168             break;
 169
 170         default:
 171             return VLC_EGENERIC;
 172     }
 173
 174     return VLC_SUCCESS;
 175 }
 176
 177 #if 0
     /* Compiled out.  Returns the value produced by the x86 "rdtsc"
      * instruction; nothing in the visible code calls it.
      * NOTE(review): the "=A" constraint is believed to describe the edx:eax
      * pair only on 32-bit x86 -- confirm before enabling this on x86-64. */
 178 static inline unsigned long long read_cycles(void)
 179 {
 180     unsigned long long v;
 181     __asm__ __volatile__("rdtsc" : "=A" (v): );
 182
 183     return v;
 184 }
 185 #endif
 186
 187 /* Following functions are local.
      * One VIDEO_FILTER_WRAPPER() line per converter compiled in this build
      * variant, under the same #if guards as the prototypes above.  The macro
      * is not defined in this file; presumably it expands to the matching
      * I420_*_Filter() function that Activate() installs -- TODO confirm in
      * vlc_filter.h. */
 188
 189 VIDEO_FILTER_WRAPPER( I420_YUY2 )
 190 VIDEO_FILTER_WRAPPER( I420_YVYU )
 191 VIDEO_FILTER_WRAPPER( I420_UYVY )
 192 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 193 VIDEO_FILTER_WRAPPER( I420_IUYV )
 194 VIDEO_FILTER_WRAPPER( I420_cyuv )
 195 #endif
 196 #if defined (MODULE_NAME_IS_i420_yuy2)
 197 VIDEO_FILTER_WRAPPER( I420_Y211 )
 198 #endif
 199
 200 /*****************************************************************************
 201  * I420_YUY2: planar YUV 4:2:0 to packed YUYV 4:2:2
      *****************************************************************************
      * Reads the Y, U and V planes of p_source and fills the packed plane
      * p_dest->p.  Width and height are taken from p_filter->fmt_in.video.
      * The pictures are walked two lines at a time (p_y1/p_y2 on input,
      * p_line1/p_line2 on output).
      * The inner loop is selected at build time by MODULE_NAME_IS_*: plain C,
      * MMX, SSE2 (aligned or unaligned accesses) or AltiVec with a C fallback.
      * C_YUV420_YUYV(), MMX_CALL(), SSE2_CALL(), MMX_END and SSE2_END are not
      * defined in this file; they presumably come from i420_yuy2.h and advance
      * the pixel pointers themselves -- confirm there.
 202  *****************************************************************************/
 203 static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
 204                                            picture_t *p_dest )
 205 {
 206     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 207     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 208     uint8_t *p_u = p_source->U_PIXELS;
 209     uint8_t *p_v = p_source->V_PIXELS;
 210
 211     int i_x, i_y;
 212
 213 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
         /* AltiVec helpers, #undef'ed again before the C fallback:
          *  VEC_NEXT_LINES  moves the line pointers on to the next pair of lines
          *                  (by one full pitch, no margin handling);
          *  VEC_LOAD_UV     loads one vector of U and one of V, advancing both
          *                  chroma pointers by 16 bytes;
          *  VEC_MERGE( a )  interleaves U and V with `a`, then for each of the
          *                  two lines loads one luma vector and stores it
          *                  merged with that same chroma vector as two
          *                  destination vectors (luma byte first). */
 214 #define VEC_NEXT_LINES( ) \
 215     p_line1  = p_line2; \
 216     p_line2 += p_dest->p->i_pitch; \
 217     p_y1     = p_y2; \
 218     p_y2    += p_source->p[Y_PLANE].i_pitch;
 219
 220 #define VEC_LOAD_UV( ) \
 221     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 222     v_vec = vec_ld( 0, p_v ); p_v += 16;
 223
 224 #define VEC_MERGE( a ) \
 225     uv_vec = a( u_vec, v_vec ); \
 226     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 227     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 228     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line1 ); p_line1 += 16; \
 229     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 230     vec_st( vec_mergeh( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16; \
 231     vec_st( vec_mergel( y_vec, uv_vec ), 0, p_line2 ); p_line2 += 16;
 232
 233     vector unsigned char u_vec;
 234     vector unsigned char v_vec;
 235     vector unsigned char uv_vec;
 236     vector unsigned char y_vec;
 237
 238     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 239            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 240     {
 241         /* Width is a multiple of 32, we take 2 lines at a time */
 242         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 243         {
 244             VEC_NEXT_LINES( );
 245             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 246             {
                     /* One chroma load serves two merges: high half, then low half */
 247                 VEC_LOAD_UV( );
 248                 VEC_MERGE( vec_mergeh );
 249                 VEC_MERGE( vec_mergel );
 250             }
 251         }
 252     }
 253 #warning FIXME: converting widths % 16 but !widths % 32 is broken on altivec
 254 #if 0
 255     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 256                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 257     {
 258         /* Width is only a multiple of 16, we take 4 lines at a time */
 259         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 260         {
 261             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 262             VEC_NEXT_LINES( );
 263             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 264             {
 265                 VEC_LOAD_UV( );
 266                 VEC_MERGE( vec_mergeh );
 267                 VEC_MERGE( vec_mergel );
 268             }
 269
 270             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 271             VEC_LOAD_UV( );
 272             VEC_MERGE( vec_mergeh );
 273
 274             /* Line 3 and 4, pixels 0 to 16 */
 275             VEC_NEXT_LINES( );
 276             VEC_MERGE( vec_mergel );
 277
 278             /* Line 3 and 4, pixels 16 to ( width ) */
 279             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 280             {
 281                 VEC_LOAD_UV( );
 282                 VEC_MERGE( vec_mergeh );
 283                 VEC_MERGE( vec_mergel );
 284             }
 285         }
 286     }
 287 #endif
 288     else
 289     {
 290         /* Crap, use the C version */
             /* (this brace is closed under the AltiVec guard that follows the
              * scalar loop below) */
 291 #undef VEC_NEXT_LINES
 292 #undef VEC_LOAD_UV
 293 #undef VEC_MERGE
 294 #endif
 295
         /* Per-line padding, i.e. pitch minus visible pitch.  The chroma margin
          * is read from plane 1 and applied to both p_u and p_v. */
 296     const int i_source_margin = p_source->p[0].i_pitch
 297                                  - p_source->p[0].i_visible_pitch;
 298     const int i_source_margin_c = p_source->p[1].i_pitch
 299                                  - p_source->p[1].i_visible_pitch;
 300     const int i_dest_margin = p_dest->p->i_pitch
 301                                - p_dest->p->i_visible_pitch;
 302
 303 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
         /* Plain C, MMX and AltiVec-fallback builds: i_height / 2 passes */
 304     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 305     {
 306         p_line1 = p_line2;
 307         p_line2 += p_dest->p->i_pitch;
 308
 309         p_y1 = p_y2;
 310         p_y2 += p_source->p[Y_PLANE].i_pitch;
 311
 312 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 313         for( i_x = p_filter->fmt_in.video.i_width / 8; i_x-- ; )
 314         {
 315             C_YUV420_YUYV( );
 316             C_YUV420_YUYV( );
 317             C_YUV420_YUYV( );
 318             C_YUV420_YUYV( );
 319         }
 320 #else
 321         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 322         {
 323             MMX_CALL( MMX_YUV420_YUYV );
 324         }
 325 #endif
             /* Columns left over after the groups of 8, always with the C macro */
 326         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 327         {
 328             C_YUV420_YUYV( );
 329         }
 330
             /* Step over the padding at the end of every line */
 331         p_y1 += i_source_margin;
 332         p_y2 += i_source_margin;
 333         p_u += i_source_margin_c;
 334         p_v += i_source_margin_c;
 335         p_line1 += i_dest_margin;
 336         p_line2 += i_dest_margin;
 337     }
 338
 339 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 340     /* re-enable FPU registers */
 341     MMX_END;
 342 #endif
 343
 344 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 345     } /* closes the AltiVec "use the C version" else block */
 346 #endif
 347
 348 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 349     /*
 350     ** SSE2 128 bits fetch/store instructions are faster
 351     ** if memory access is 16 bytes aligned
 352     */
 353
         /* Aligned variant only when the luma and destination pitches and both
          * start addresses all have their low four bits clear */
 354     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 355         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 356     {
 357         /* use faster SSE2 aligned fetch and store */
 358         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 359         {
 360             p_line1 = p_line2;
 361             p_line2 += p_dest->p->i_pitch;
 362
 363             p_y1 = p_y2;
 364             p_y2 += p_source->p[Y_PLANE].i_pitch;
 365
 366             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 367             {
 368                 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED );
 369             }
                 /* Columns left over after the groups of 16 */
 370             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 371             {
 372                 C_YUV420_YUYV( );
 373             }
 374
 375             p_y1 += i_source_margin;
 376             p_y2 += i_source_margin;
 377             p_u += i_source_margin_c;
 378             p_v += i_source_margin_c;
 379             p_line1 += i_dest_margin;
 380             p_line2 += i_dest_margin;
 381         }
 382     }
 383     else
 384     {
 385         /* use slower SSE2 unaligned fetch and store */
 386         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 387         {
 388             p_line1 = p_line2;
 389             p_line2 += p_dest->p->i_pitch;
 390
 391             p_y1 = p_y2;
 392             p_y2 += p_source->p[Y_PLANE].i_pitch;
 393
 394             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 395             {
 396                 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED );
 397             }
 398             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 399             {
 400                 C_YUV420_YUYV( );
 401             }
 402
 403             p_y1 += i_source_margin;
 404             p_y2 += i_source_margin;
 405             p_u += i_source_margin_c;
 406             p_v += i_source_margin_c;
 407             p_line1 += i_dest_margin;
 408             p_line2 += i_dest_margin;
 409         }
 410     }
 411     /* make sure all SSE2 stores are visible thereafter */
 412     SSE2_END;
 413
 414 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 415 }
 416
 417 /*****************************************************************************
 418  * I420_YVYU: planar YUV 4:2:0 to packed YVYU 4:2:2
      *****************************************************************************
      * Same structure as I420_YUY2, using the *_YVYU conversion macros instead.
      * Reads the Y, U and V planes of p_source and fills the packed plane
      * p_dest->p; width and height are taken from p_filter->fmt_in.video and
      * the pictures are walked two lines at a time.
      * The inner loop is selected at build time by MODULE_NAME_IS_*: plain C,
      * MMX, SSE2 (aligned or unaligned accesses) or AltiVec with a C fallback.
      * C_YUV420_YVYU(), MMX_CALL(), SSE2_CALL(), MMX_END and SSE2_END are not
      * defined in this file; they presumably come from i420_yuy2.h and advance
      * the pixel pointers themselves -- confirm there.
 419  *****************************************************************************/
 420 static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
 421                                            picture_t *p_dest )
 422 {
 423     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 424     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 425     uint8_t *p_u = p_source->U_PIXELS;
 426     uint8_t *p_v = p_source->V_PIXELS;
 427
 428     int i_x, i_y;
 429
 430 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
         /* AltiVec helpers, #undef'ed again before the C fallback.  VEC_MERGE
          * passes V before U to the merge, which is the only difference from
          * the helper used by I420_YUY2; luma bytes still come first in the
          * stored vectors. */
 431 #define VEC_NEXT_LINES( ) \
 432     p_line1  = p_line2; \
 433     p_line2 += p_dest->p->i_pitch; \
 434     p_y1     = p_y2; \
 435     p_y2    += p_source->p[Y_PLANE].i_pitch;
 436
 437 #define VEC_LOAD_UV( ) \
 438     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 439     v_vec = vec_ld( 0, p_v ); p_v += 16;
 440
 441 #define VEC_MERGE( a ) \
 442     vu_vec = a( v_vec, u_vec ); \
 443     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 444     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 445     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line1 ); p_line1 += 16; \
 446     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 447     vec_st( vec_mergeh( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16; \
 448     vec_st( vec_mergel( y_vec, vu_vec ), 0, p_line2 ); p_line2 += 16;
 449
 450     vector unsigned char u_vec;
 451     vector unsigned char v_vec;
 452     vector unsigned char vu_vec;
 453     vector unsigned char y_vec;
 454
 455     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 456            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 457     {
 458         /* Width is a multiple of 32, we take 2 lines at a time */
 459         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 460         {
 461             VEC_NEXT_LINES( );
 462             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 463             {
 464                 VEC_LOAD_UV( );
 465                 VEC_MERGE( vec_mergeh );
 466                 VEC_MERGE( vec_mergel );
 467             }
 468         }
 469     }
         /* NOTE(review): I420_YUY2 compiles its equivalent of the branch below
          * out with "#if 0" and a #warning calling it broken on AltiVec; here
          * it is still enabled -- confirm whether the same problem applies. */
 470     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 471                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 472     {
 473         /* Width is only a multiple of 16, we take 4 lines at a time */
 474         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 475         {
 476             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 477             VEC_NEXT_LINES( );
 478             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 479             {
 480                 VEC_LOAD_UV( );
 481                 VEC_MERGE( vec_mergeh );
 482                 VEC_MERGE( vec_mergel );
 483             }
 484
 485             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 486             VEC_LOAD_UV( );
 487             VEC_MERGE( vec_mergeh );
 488
 489             /* Line 3 and 4, pixels 0 to 16 */
 490             VEC_NEXT_LINES( );
 491             VEC_MERGE( vec_mergel );
 492
 493             /* Line 3 and 4, pixels 16 to ( width ) */
 494             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 495             {
 496                 VEC_LOAD_UV( );
 497                 VEC_MERGE( vec_mergeh );
 498                 VEC_MERGE( vec_mergel );
 499             }
 500         }
 501     }
 502     else
 503     {
 504         /* Crap, use the C version */
             /* (this brace is closed under the AltiVec guard that follows the
              * scalar loop below) */
 505 #undef VEC_NEXT_LINES
 506 #undef VEC_LOAD_UV
 507 #undef VEC_MERGE
 508 #endif
 509
         /* Per-line padding, i.e. pitch minus visible pitch.  The chroma margin
          * is read from plane 1 and applied to both p_u and p_v. */
 510     const int i_source_margin = p_source->p[0].i_pitch
 511                                  - p_source->p[0].i_visible_pitch;
 512     const int i_source_margin_c = p_source->p[1].i_pitch
 513                                  - p_source->p[1].i_visible_pitch;
 514     const int i_dest_margin = p_dest->p->i_pitch
 515                                - p_dest->p->i_visible_pitch;
 516
 517 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
         /* Plain C, MMX and AltiVec-fallback builds: i_height / 2 passes */
 518     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 519     {
 520         p_line1 = p_line2;
 521         p_line2 += p_dest->p->i_pitch;
 522
 523         p_y1 = p_y2;
 524         p_y2 += p_source->p[Y_PLANE].i_pitch;
 525
 526         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 527         {
 528 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 529             C_YUV420_YVYU( );
 530             C_YUV420_YVYU( );
 531             C_YUV420_YVYU( );
 532             C_YUV420_YVYU( );
 533 #else
 534             MMX_CALL( MMX_YUV420_YVYU );
 535 #endif
 536         }
             /* Columns left over after the groups of 8, always with the C macro */
 537         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 538         {
 539             C_YUV420_YVYU( );
 540         }
 541
             /* Step over the padding at the end of every line */
 542         p_y1 += i_source_margin;
 543         p_y2 += i_source_margin;
 544         p_u += i_source_margin_c;
 545         p_v += i_source_margin_c;
 546         p_line1 += i_dest_margin;
 547         p_line2 += i_dest_margin;
 548     }
 549
 550 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 551     /* re-enable FPU registers */
 552     MMX_END;
 553 #endif
 554
 555 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 556     } /* closes the AltiVec "use the C version" else block */
 557 #endif
 558
 559 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 560     /*
 561     ** SSE2 128 bits fetch/store instructions are faster
 562     ** if memory access is 16 bytes aligned
 563     */
         /* Aligned variant only when the luma and destination pitches and both
          * start addresses all have their low four bits clear */
 564     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 565         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 566     {
 567         /* use faster SSE2 aligned fetch and store */
 568         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 569         {
 570             p_line1 = p_line2;
 571             p_line2 += p_dest->p->i_pitch;
 572
 573             p_y1 = p_y2;
 574             p_y2 += p_source->p[Y_PLANE].i_pitch;
 575
 576             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 577             {
 578                 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED );
 579             }
                 /* Columns left over after the groups of 16 */
 580             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 581             {
 582                 C_YUV420_YVYU( );
 583             }
 584
 585             p_y1 += i_source_margin;
 586             p_y2 += i_source_margin;
 587             p_u += i_source_margin_c;
 588             p_v += i_source_margin_c;
 589             p_line1 += i_dest_margin;
 590             p_line2 += i_dest_margin;
 591         }
 592     }
 593     else
 594     {
 595         /* use slower SSE2 unaligned fetch and store */
 596         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 597         {
 598             p_line1 = p_line2;
 599             p_line2 += p_dest->p->i_pitch;
 600
 601             p_y1 = p_y2;
 602             p_y2 += p_source->p[Y_PLANE].i_pitch;
 603
 604             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 605             {
 606                 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED );
 607             }
 608             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 609             {
 610                 C_YUV420_YVYU( );
 611             }
 612
 613             p_y1 += i_source_margin;
 614             p_y2 += i_source_margin;
 615             p_u += i_source_margin_c;
 616             p_v += i_source_margin_c;
 617             p_line1 += i_dest_margin;
 618             p_line2 += i_dest_margin;
 619         }
 620     }
 621     /* make sure all SSE2 stores are visible thereafter */
 622     SSE2_END;
 623 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 624 }
 625
 626 /*****************************************************************************
 627  * I420_UYVY: planar YUV 4:2:0 to packed UYVY 4:2:2
      *****************************************************************************
      * Same structure as I420_YUY2, using the *_UYVY conversion macros instead.
      * Reads the Y, U and V planes of p_source and fills the packed plane
      * p_dest->p; width and height are taken from p_filter->fmt_in.video and
      * the pictures are walked two lines at a time.
      * The inner loop is selected at build time by MODULE_NAME_IS_*: plain C,
      * MMX, SSE2 (aligned or unaligned accesses) or AltiVec with a C fallback.
      * C_YUV420_UYVY(), MMX_CALL(), SSE2_CALL(), MMX_END and SSE2_END are not
      * defined in this file; they presumably come from i420_yuy2.h and advance
      * the pixel pointers themselves -- confirm there.
 628  *****************************************************************************/
 629 static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
 630                                            picture_t *p_dest )
 631 {
 632     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 633     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 634     uint8_t *p_u = p_source->U_PIXELS;
 635     uint8_t *p_v = p_source->V_PIXELS;
 636
 637     int i_x, i_y;
 638
 639 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
         /* AltiVec helpers, #undef'ed again before the C fallback.  VEC_MERGE
          * merges U with V as in I420_YUY2, but hands the chroma vector to
          * vec_mergeh/vec_mergel first, so chroma bytes precede luma bytes in
          * the stored vectors. */
 640 #define VEC_NEXT_LINES( ) \
 641     p_line1  = p_line2; \
 642     p_line2 += p_dest->p->i_pitch; \
 643     p_y1     = p_y2; \
 644     p_y2    += p_source->p[Y_PLANE].i_pitch;
 645
 646 #define VEC_LOAD_UV( ) \
 647     u_vec = vec_ld( 0, p_u ); p_u += 16; \
 648     v_vec = vec_ld( 0, p_v ); p_v += 16;
 649
 650 #define VEC_MERGE( a ) \
 651     uv_vec = a( u_vec, v_vec ); \
 652     y_vec = vec_ld( 0, p_y1 ); p_y1 += 16; \
 653     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 654     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line1 ); p_line1 += 16; \
 655     y_vec = vec_ld( 0, p_y2 ); p_y2 += 16; \
 656     vec_st( vec_mergeh( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16; \
 657     vec_st( vec_mergel( uv_vec, y_vec ), 0, p_line2 ); p_line2 += 16;
 658
 659     vector unsigned char u_vec;
 660     vector unsigned char v_vec;
 661     vector unsigned char uv_vec;
 662     vector unsigned char y_vec;
 663
 664     if( !( ( p_filter->fmt_in.video.i_width % 32 ) |
 665            ( p_filter->fmt_in.video.i_height % 2 ) ) )
 666     {
 667         /* Width is a multiple of 32, we take 2 lines at a time */
 668         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 669         {
 670             VEC_NEXT_LINES( );
 671             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 672             {
 673                 VEC_LOAD_UV( );
 674                 VEC_MERGE( vec_mergeh );
 675                 VEC_MERGE( vec_mergel );
 676             }
 677         }
 678     }
         /* NOTE(review): I420_YUY2 compiles its equivalent of the branch below
          * out with "#if 0" and a #warning calling it broken on AltiVec; here
          * it is still enabled -- confirm whether the same problem applies. */
 679     else if( !( ( p_filter->fmt_in.video.i_width % 16 ) |
 680                 ( p_filter->fmt_in.video.i_height % 4 ) ) )
 681     {
 682         /* Width is only a multiple of 16, we take 4 lines at a time */
 683         for( i_y = p_filter->fmt_in.video.i_height / 4 ; i_y-- ; )
 684         {
 685             /* Line 1 and 2, pixels 0 to ( width - 16 ) */
 686             VEC_NEXT_LINES( );
 687             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 688             {
 689                 VEC_LOAD_UV( );
 690                 VEC_MERGE( vec_mergeh );
 691                 VEC_MERGE( vec_mergel );
 692             }
 693
 694             /* Line 1 and 2, pixels ( width - 16 ) to ( width ) */
 695             VEC_LOAD_UV( );
 696             VEC_MERGE( vec_mergeh );
 697
 698             /* Line 3 and 4, pixels 0 to 16 */
 699             VEC_NEXT_LINES( );
 700             VEC_MERGE( vec_mergel );
 701
 702             /* Line 3 and 4, pixels 16 to ( width ) */
 703             for( i_x = p_filter->fmt_in.video.i_width / 32 ; i_x-- ; )
 704             {
 705                 VEC_LOAD_UV( );
 706                 VEC_MERGE( vec_mergeh );
 707                 VEC_MERGE( vec_mergel );
 708             }
 709         }
 710     }
 711     else
 712     {
 713         /* Crap, use the C version */
             /* (this brace is closed under the AltiVec guard that follows the
              * scalar loop below) */
 714 #undef VEC_NEXT_LINES
 715 #undef VEC_LOAD_UV
 716 #undef VEC_MERGE
 717 #endif
 718
         /* Per-line padding, i.e. pitch minus visible pitch.  The chroma margin
          * is read from plane 1 and applied to both p_u and p_v. */
 719     const int i_source_margin = p_source->p[0].i_pitch
 720                                  - p_source->p[0].i_visible_pitch;
 721     const int i_source_margin_c = p_source->p[1].i_pitch
 722                                  - p_source->p[1].i_visible_pitch;
 723     const int i_dest_margin = p_dest->p->i_pitch
 724                                - p_dest->p->i_visible_pitch;
 725
 726 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
         /* Plain C, MMX and AltiVec-fallback builds: i_height / 2 passes */
 727     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 728     {
 729         p_line1 = p_line2;
 730         p_line2 += p_dest->p->i_pitch;
 731
 732         p_y1 = p_y2;
 733         p_y2 += p_source->p[Y_PLANE].i_pitch;
 734
 735         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 736         {
 737 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 738             C_YUV420_UYVY( );
 739             C_YUV420_UYVY( );
 740             C_YUV420_UYVY( );
 741             C_YUV420_UYVY( );
 742 #else
 743             MMX_CALL( MMX_YUV420_UYVY );
 744 #endif
 745         }
             /* Columns left over after the groups of 8, always with the C macro */
 746         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x--; )
 747         {
 748             C_YUV420_UYVY( );
 749         }
 750
             /* Step over the padding at the end of every line */
 751         p_y1 += i_source_margin;
 752         p_y2 += i_source_margin;
 753         p_u += i_source_margin_c;
 754         p_v += i_source_margin_c;
 755         p_line1 += i_dest_margin;
 756         p_line2 += i_dest_margin;
 757     }
 758
 759 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 760     /* re-enable FPU registers */
 761     MMX_END;
 762 #endif
 763
 764 #if defined (MODULE_NAME_IS_i420_yuy2_altivec)
 765     } /* closes the AltiVec "use the C version" else block */
 766 #endif
 767
 768 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 769     /*
 770     ** SSE2 128 bits fetch/store instructions are faster
 771     ** if memory access is 16 bytes aligned
 772     */
         /* Aligned variant only when the luma and destination pitches and both
          * start addresses all have their low four bits clear */
 773     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 774         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 775     {
 776         /* use faster SSE2 aligned fetch and store */
 777         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 778         {
 779             p_line1 = p_line2;
 780             p_line2 += p_dest->p->i_pitch;
 781
 782             p_y1 = p_y2;
 783             p_y2 += p_source->p[Y_PLANE].i_pitch;
 784
 785             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 786             {
 787                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 788             }
                 /* Columns left over after the groups of 16 */
 789             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 790             {
 791                 C_YUV420_UYVY( );
 792             }
 793
 794             p_y1 += i_source_margin;
 795             p_y2 += i_source_margin;
 796             p_u += i_source_margin_c;
 797             p_v += i_source_margin_c;
 798             p_line1 += i_dest_margin;
 799             p_line2 += i_dest_margin;
 800         }
 801     }
 802     else
 803     {
 804         /* use slower SSE2 unaligned fetch and store */
 805         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 806         {
 807             p_line1 = p_line2;
 808             p_line2 += p_dest->p->i_pitch;
 809
 810             p_y1 = p_y2;
 811             p_y2 += p_source->p[Y_PLANE].i_pitch;
 812
 813             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 814             {
 815                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 816             }
 817             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 818             {
 819                 C_YUV420_UYVY( );
 820             }
 821
 822             p_y1 += i_source_margin;
 823             p_y2 += i_source_margin;
 824             p_u += i_source_margin_c;
 825             p_v += i_source_margin_c;
 826             p_line1 += i_dest_margin;
 827             p_line2 += i_dest_margin;
 828         }
 829     }
 830     /* make sure all SSE2 stores are visible thereafter */
 831     SSE2_END;
 832 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 833 }
 834
 835 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 836 /*****************************************************************************
 837  * I420_IUYV: planar YUV 4:2:0 to interleaved packed UYVY 4:2:2
 838  *****************************************************************************/
 839 static void I420_IUYV( filter_t *p_filter,
 840                        picture_t *p_source, picture_t *p_dest )
 841 {
 842     /* FIXME: conversion still to be written; only an error is reported */
 843     msg_Err( p_filter, "I420_IUYV unimplemented, please harass <sam@zoy.org>" );
 844     VLC_UNUSED(p_dest); VLC_UNUSED(p_source);
 845 }
 846
 847 /*****************************************************************************
 848  * I420_cyuv: planar YUV 4:2:0 to upside-down packed UYVY 4:2:2
 849  *****************************************************************************/
 850 static void I420_cyuv( filter_t *p_filter, picture_t *p_source,
 851                                            picture_t *p_dest )
 852 {
 853     uint8_t *p_line1 = p_dest->p->p_pixels +
 854                        p_dest->p->i_visible_lines * p_dest->p->i_pitch
 855                        + p_dest->p->i_pitch;
 856     uint8_t *p_line2 = p_dest->p->p_pixels +
 857                        p_dest->p->i_visible_lines * p_dest->p->i_pitch;
 858     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 859     uint8_t *p_u = p_source->U_PIXELS;
 860     uint8_t *p_v = p_source->V_PIXELS;
 861
 862     int i_x, i_y;
 863
 864     const int i_source_margin = p_source->p[0].i_pitch
 865                                  - p_source->p[0].i_visible_pitch;
 866     const int i_source_margin_c = p_source->p[1].i_pitch
 867                                  - p_source->p[1].i_visible_pitch;
 868     const int i_dest_margin = p_dest->p->i_pitch
 869                                - p_dest->p->i_visible_pitch;
 870
 871 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
 872     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 873     {
 874         p_line1 -= 3 * p_dest->p->i_pitch;
 875         p_line2 -= 3 * p_dest->p->i_pitch;
 876
 877         p_y1 = p_y2;
 878         p_y2 += p_source->p[Y_PLANE].i_pitch;
 879
 880         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
 881         {
 882 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx)
 883             C_YUV420_UYVY( );
 884             C_YUV420_UYVY( );
 885             C_YUV420_UYVY( );
 886             C_YUV420_UYVY( );
 887 #else
 888             MMX_CALL( MMX_YUV420_UYVY );
 889 #endif
 890         }
 891         for( i_x = ( p_filter->fmt_in.video.i_width % 8 ) / 2; i_x-- ; )
 892         {
 893             C_YUV420_UYVY( );
 894         }
 895
 896         p_y1 += i_source_margin;
 897         p_y2 += i_source_margin;
 898         p_u += i_source_margin_c;
 899         p_v += i_source_margin_c;
 900         p_line1 += i_dest_margin;
 901         p_line2 += i_dest_margin;
 902     }
 903
 904 #if defined (MODULE_NAME_IS_i420_yuy2_mmx)
 905     /* re-enable FPU registers */
 906     MMX_END;
 907 #endif
 908
 909 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 910     /*
 911     ** SSE2 128 bits fetch/store instructions are faster
 912     ** if memory access is 16 bytes aligned
 913     */
 914     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
 915         ((intptr_t)p_line2|(intptr_t)p_y2))) )
 916     {
 917         /* use faster SSE2 aligned fetch and store */
 918         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 919         {
 920             p_line1 = p_line2;
 921             p_line2 += p_dest->p->i_pitch;
 922
 923             p_y1 = p_y2;
 924             p_y2 += p_source->p[Y_PLANE].i_pitch;
 925
 926             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 927             {
 928                 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED );
 929             }
 930             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 931             {
 932                 C_YUV420_UYVY( );
 933             }
 934
 935             p_y1 += i_source_margin;
 936             p_y2 += i_source_margin;
 937             p_u += i_source_margin_c;
 938             p_v += i_source_margin_c;
 939             p_line1 += i_dest_margin;
 940             p_line2 += i_dest_margin;
 941         }
 942     }
 943     else
 944     {
 945         /* use slower SSE2 unaligned fetch and store */
 946         for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 947         {
 948             p_line1 = p_line2;
 949             p_line2 += p_dest->p->i_pitch;
 950
 951             p_y1 = p_y2;
 952             p_y2 += p_source->p[Y_PLANE].i_pitch;
 953
 954             for( i_x = p_filter->fmt_in.video.i_width / 16 ; i_x-- ; )
 955             {
 956                 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED );
 957             }
 958             for( i_x = ( p_filter->fmt_in.video.i_width % 16 ) / 2; i_x-- ; )
 959             {
 960                 C_YUV420_UYVY( );
 961             }
 962
 963             p_y1 += i_source_margin;
 964             p_y2 += i_source_margin;
 965             p_u += i_source_margin_c;
 966             p_v += i_source_margin_c;
 967             p_line1 += i_dest_margin;
 968             p_line2 += i_dest_margin;
 969         }
 970     }
 971     /* make sure all SSE2 stores are visible thereafter */
 972     SSE2_END;
 973 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
 974 }
 975 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec)
 976
 977 /*****************************************************************************
 978  * I420_Y211: planar YUV 4:2:0 to packed YUYV 2:1:1
 979  *****************************************************************************/
 980 #if defined (MODULE_NAME_IS_i420_yuy2)
 981 static void I420_Y211( filter_t *p_filter, picture_t *p_source,
 982                                            picture_t *p_dest )
 983 {
 984     uint8_t *p_line1, *p_line2 = p_dest->p->p_pixels;
 985     uint8_t *p_y1, *p_y2 = p_source->Y_PIXELS;
 986     uint8_t *p_u = p_source->U_PIXELS;
 987     uint8_t *p_v = p_source->V_PIXELS;
 988
 989     int i_x, i_y;
 990
 991     const int i_source_margin = p_source->p[0].i_pitch
 992                                  - p_source->p[0].i_visible_pitch;
 993     const int i_source_margin_c = p_source->p[1].i_pitch
 994                                  - p_source->p[1].i_visible_pitch;
 995     const int i_dest_margin = p_dest->p->i_pitch
 996                                - p_dest->p->i_visible_pitch;
 997
 998     for( i_y = p_filter->fmt_in.video.i_height / 2 ; i_y-- ; )
 999     {
1000         p_line1 = p_line2;
1001         p_line2 += p_dest->p->i_pitch;
1002
1003         p_y1 = p_y2;
1004         p_y2 += p_source->p[Y_PLANE].i_pitch;
1005
1006         for( i_x = p_filter->fmt_in.video.i_width / 8 ; i_x-- ; )
1007         {
1008             C_YUV420_Y211( );
1009             C_YUV420_Y211( );
1010         }
1011
1012         p_y1 += i_source_margin;
1013         p_y2 += i_source_margin;
1014         p_u += i_source_margin_c;
1015         p_v += i_source_margin_c;
1016         p_line1 += i_dest_margin;
1017         p_line2 += i_dest_margin;
1018     }
1019 }
1020 #endif