modules/video_chroma/i420_rgb16.c

   1 /*****************************************************************************
   2  * i420_rgb16.c : YUV to bitmap RGB conversion module for vlc
   3  *****************************************************************************
   4  * Copyright (C) 2000 the VideoLAN team
   5  * $Id$
   6  *
   7  * Authors: Samuel Hocevar <sam@zoy.org>
   8  *          Damien Fouilleul <damienf@videolan.org>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
  23  *****************************************************************************/
  24
  25 /*****************************************************************************
  26  * Preamble
  27  *****************************************************************************/
  28
  29 #ifdef HAVE_CONFIG_H
  30 # include "config.h"
  31 #endif
  32
  33 #include <vlc_common.h>
  34 #include <vlc_filter.h>
  35
  36 #include "i420_rgb.h"
  37 #if defined (MODULE_NAME_IS_i420_rgb)
  38 #   include "i420_rgb_c.h"
  39 #elif defined (MODULE_NAME_IS_i420_rgb_mmx)
  40 #   include "../mmx/i420_rgb_mmx.h"
  41 #elif defined (MODULE_NAME_IS_i420_rgb_sse2)
  42 #   include "../mmx/i420_rgb_mmx.h"
  43 #endif
  44
  45 static void SetOffset( int, int, int, int, bool *,
  46                        unsigned int *, int * );
     /* Forward declaration only; the definition is not in this part of the file.
      * Every call below passes, in order: input width, input height, output
      * width, output height, &b_hscale, &i_vscale and the p_sys->p_offset array. */
  47
  48 #if defined (MODULE_NAME_IS_i420_rgb)
  49 /*****************************************************************************
  50  * I420_RGB16_dither: color YUV 4:2:0 to RGB 16 bpp with dithering
  51  *****************************************************************************
  52  * Horizontal alignment needed:
  53  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
  54  *  - output: 1 pixel (2 bytes), margins allowed
  55  * Vertical alignment needed:
  56  *  - input: 2 lines (2 Y lines, 1 U/V line)
  57  *  - output: 1 line
  58  *****************************************************************************/
  59 void I420_RGB16_dither( filter_t *p_filter, picture_t *p_src,
  60                                                 picture_t *p_dest )
  61 {
         /* Converts the Y, U and V planes of p_src into 16-bit pixels stored in
          * p_dest, going through the *_DITHER conversion macros and the four
          * dither tables built below.
          *
          *  p_filter: supplies the input/output sizes (fmt_in / fmt_out), the
          *            output i_rrshift, and p_sys (lookup table p_rgb16, line
          *            buffer p_buffer, offset array p_offset).
          *  p_src:    source picture; planes 0 and 1 provide the pitches.
          *  p_dest:   destination picture; only its first plane is written.
          *
          * Returns nothing.
          * NOTE(review): CONVERT_*_DITHER, SCALE_WIDTH, SCALE_HEIGHT and SHIFT
          * are defined outside this part of the file and presumably use the
          * locals declared here by name -- check the headers before renaming
          * or removing any of them. */
  62     /* Output cursor (16-bit pixels of p_dest) and Y/U/V input cursors */
  63     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
  64     uint8_t  *p_y   = p_src->Y_PIXELS;
  65     uint8_t  *p_u   = p_src->U_PIXELS;
  66     uint8_t  *p_v   = p_src->V_PIXELS;
  67
  68     bool   b_hscale;                        /* horizontal scaling type */
  69     unsigned int i_vscale;                          /* vertical scaling type */
  70     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
  71     unsigned int i_real_y;                                          /* y % 4 */
  72
  73     int         i_right_margin;      /* dest pitch minus dest visible pitch */
  74     int         i_rewind;     /* pixels to step back for the last 8-pixel group */
  75     int         i_scale_count;                       /* scale modulo counter */
  76     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
  77     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
  78     int         i_uval, i_vval;                           /* U and V samples */
  79     int         i_red, i_green, i_blue;   /* presumably set by the CONVERT macros */
  80     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
  81     uint16_t *  p_ybase;                     /* Y dependent conversion table */
  82
  83     /* Conversion buffer pointer */
  84     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
  85     uint16_t *  p_buffer;
  86
  87     /* Offset array pointer */
  88     int *       p_offset_start = p_filter->p_sys->p_offset;
  89     int *       p_offset;
  90
         /* Bytes between the visible end of a line and the start of the next
          * one, for the Y plane (p[0]) and for plane p[1] */
  91     const int i_source_margin = p_src->p[0].i_pitch
  92                                  - p_src->p[0].i_visible_pitch;
  93     const int i_source_margin_c = p_src->p[1].i_pitch
  94                                  - p_src->p[1].i_visible_pitch;
  95
  96     /* The dithering matrices */
  97     int dither10[4] = {  0x0,  0x8,  0x2,  0xa };
  98     int dither11[4] = {  0xc,  0x4,  0xe,  0x6 };
  99     int dither12[4] = {  0x3,  0xb,  0x1,  0x9 };
 100     int dither13[4] = {  0xf,  0x7,  0xd,  0x5 };
 101
         /* Scale every table entry by the same left shift, which depends on
          * SHIFT and on the output format's i_rrshift */
 102     for(i_x = 0; i_x < 4; i_x++)
 103     {
 104         dither10[i_x] = dither10[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
 105         dither11[i_x] = dither11[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
 106         dither12[i_x] = dither12[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
 107         dither13[i_x] = dither13[i_x] << (SHIFT - 4 + p_filter->fmt_out.video.i_rrshift);
 108     }
 109
 110     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
         /* (-width) & 7 is 0 when the width is a multiple of 8, otherwise the
          * number of pixels missing to reach the next multiple of 8 */
 111     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 112
 113     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 114      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 115      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 116     SetOffset( p_filter->fmt_in.video.i_width,
 117                p_filter->fmt_in.video.i_height,
 118                p_filter->fmt_out.video.i_width,
 119                p_filter->fmt_out.video.i_height,
 120                &b_hscale, &i_vscale, p_offset_start );
 121
 122     /*
 123      * Perform conversion
 124      */
         /* Counter starts at the output height when i_vscale == 1, at the input
          * height otherwise; it is not touched again in the visible code
          * (presumably updated inside SCALE_HEIGHT) */
 125     i_scale_count = ( i_vscale == 1 ) ?
 126                     p_filter->fmt_out.video.i_height :
 127                     p_filter->fmt_in.video.i_height;
 128     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 129     {
 130         i_real_y = i_y & 0x3;
 131         p_pic_start = p_pic;
             /* Convert into the intermediate buffer when horizontal scaling is
              * requested, straight into the destination line otherwise */
 132         p_buffer = b_hscale ? p_buffer_start : p_pic;
 133
             /* One iteration handles 8 pixels: 4 YUV + 4 Y-only conversions,
              * cycling through the four dither tables twice */
 134         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 135         {
 136             int *p_dither = dither10;
 137             CONVERT_YUV_PIXEL_DITHER(2);
 138             p_dither = dither11;
 139             CONVERT_Y_PIXEL_DITHER(2);
 140             p_dither = dither12;
 141             CONVERT_YUV_PIXEL_DITHER(2);
 142             p_dither = dither13;
 143             CONVERT_Y_PIXEL_DITHER(2);
 144             p_dither = dither10;
 145             CONVERT_YUV_PIXEL_DITHER(2);
 146             p_dither = dither11;
 147             CONVERT_Y_PIXEL_DITHER(2);
 148             p_dither = dither12;
 149             CONVERT_YUV_PIXEL_DITHER(2);
 150             p_dither = dither13;
 151             CONVERT_Y_PIXEL_DITHER(2);
 152         }
 153
 154         /* Here we do some unaligned reads and duplicate conversions, but
 155          * at least we have all the pixels */
 156         if( i_rewind )
 157         {
 158             int *p_dither = dither10;
                 /* Step the cursors back (half as far for chroma) and convert
                  * one more group of 8 pixels */
 159             p_y -= i_rewind;
 160             p_u -= i_rewind >> 1;
 161             p_v -= i_rewind >> 1;
 162             p_buffer -= i_rewind;
 163             CONVERT_YUV_PIXEL_DITHER(2);
 164             p_dither = dither11;
 165             CONVERT_Y_PIXEL_DITHER(2);
 166             p_dither = dither12;
 167             CONVERT_YUV_PIXEL_DITHER(2);
 168             p_dither = dither13;
 169             CONVERT_Y_PIXEL_DITHER(2);
 170             p_dither = dither10;
 171             CONVERT_YUV_PIXEL_DITHER(2);
 172             p_dither = dither11;
 173             CONVERT_Y_PIXEL_DITHER(2);
 174             p_dither = dither12;
 175             CONVERT_YUV_PIXEL_DITHER(2);
 176             p_dither = dither13;
 177             CONVERT_Y_PIXEL_DITHER(2);
 178         }
             /* NOTE(review): both macros come from a header not shown here;
              * presumably they copy/scale the converted line and advance p_pic
              * -- confirm against their definitions */
 179         SCALE_WIDTH;
 180         SCALE_HEIGHT( 420, 2 );
 181
             /* Skip the Y line padding on every line; the chroma padding is
              * only skipped after odd-numbered lines */
 182         p_y += i_source_margin;
 183         if( i_y % 2 )
 184         {
 185             p_u += i_source_margin_c;
 186             p_v += i_source_margin_c;
 187         }
 188     }
 189 }
 190 #endif
 191
 192 /*****************************************************************************
 193  * I420_RGB16: color YUV 4:2:0 to RGB 16 bpp
 194  *****************************************************************************
 195  * Horizontal alignment needed:
 196  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 197  *  - output: 1 pixel (2 bytes), margins allowed
 198  * Vertical alignment needed:
 199  *  - input: 2 lines (2 Y lines, 1 U/V line)
 200  *  - output: 1 line
 201  *****************************************************************************/
 202
 203 #if defined (MODULE_NAME_IS_i420_rgb)
 204
 205 void I420_RGB16( filter_t *p_filter, picture_t *p_src,
 206                                          picture_t *p_dest )
 207 {
         /* Converts the Y, U and V planes of p_src into 16-bit pixels stored in
          * p_dest through the CONVERT_YUV_PIXEL / CONVERT_Y_PIXEL macros (no
          * dither tables are used here).
          *
          *  p_filter: supplies the input/output sizes (fmt_in / fmt_out) and
          *            p_sys (lookup table p_rgb16, line buffer p_buffer, offset
          *            array p_offset).
          *  p_src:    source picture; planes 0 and 1 provide the pitches.
          *  p_dest:   destination picture; only its first plane is written.
          *
          * Returns nothing.
          * NOTE(review): the CONVERT_* and SCALE_* macros are defined outside
          * this part of the file and presumably use the locals declared here by
          * name -- check the headers before renaming or removing any of them. */
 208     /* Output cursor (16-bit pixels of p_dest) and Y/U/V input cursors */
 209     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
 210     uint8_t  *p_y   = p_src->Y_PIXELS;
 211     uint8_t  *p_u   = p_src->U_PIXELS;
 212     uint8_t  *p_v   = p_src->V_PIXELS;
 213
 214     bool  b_hscale;                         /* horizontal scaling type */
 215     unsigned int i_vscale;                          /* vertical scaling type */
 216     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 217
 218     int         i_right_margin;      /* dest pitch minus dest visible pitch */
 219     int         i_rewind;     /* pixels to step back for the last 8-pixel group */
 220     int         i_scale_count;                       /* scale modulo counter */
 221     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 222     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
 223     int         i_uval, i_vval;                           /* U and V samples */
 224     int         i_red, i_green, i_blue;   /* presumably set by the CONVERT macros */
 225     uint16_t *  p_yuv = p_filter->p_sys->p_rgb16;
 226     uint16_t *  p_ybase;                     /* Y dependent conversion table */
 227
 228     /* Conversion buffer pointer */
 229     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
 230     uint16_t *  p_buffer;
 231
 232     /* Offset array pointer */
 233     int *       p_offset_start = p_filter->p_sys->p_offset;
 234     int *       p_offset;
 235
         /* Bytes between the visible end of a line and the start of the next
          * one, for the Y plane (p[0]) and for plane p[1] */
 236     const int i_source_margin = p_src->p[0].i_pitch
 237                                  - p_src->p[0].i_visible_pitch;
 238     const int i_source_margin_c = p_src->p[1].i_pitch
 239                                  - p_src->p[1].i_visible_pitch;
 240
 241     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
         /* (-width) & 7 is 0 when the width is a multiple of 8, otherwise the
          * number of pixels missing to reach the next multiple of 8 */
 242     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 243
 244     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 245      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 246      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 247     SetOffset( p_filter->fmt_in.video.i_width,
 248                p_filter->fmt_in.video.i_height,
 249                p_filter->fmt_out.video.i_width,
 250                p_filter->fmt_out.video.i_height,
 251                &b_hscale, &i_vscale, p_offset_start );
 252
 253     /*
 254      * Perform conversion
 255      */
         /* Counter starts at the output height when i_vscale == 1, at the input
          * height otherwise; it is not touched again in the visible code
          * (presumably updated inside SCALE_HEIGHT) */
 256     i_scale_count = ( i_vscale == 1 ) ?
 257                     p_filter->fmt_out.video.i_height :
 258                     p_filter->fmt_in.video.i_height;
 259     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 260     {
 261         p_pic_start = p_pic;
             /* Convert into the intermediate buffer when horizontal scaling is
              * requested, straight into the destination line otherwise */
 262         p_buffer = b_hscale ? p_buffer_start : p_pic;
 263
             /* One iteration handles 8 pixels: 4 YUV + 4 Y-only conversions */
 264         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 265         {
 266             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 267             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 268             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 269             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 270         }
 271
 272         /* Here we do some unaligned reads and duplicate conversions, but
 273          * at least we have all the pixels */
 274         if( i_rewind )
 275         {
                 /* Step the cursors back (half as far for chroma) and convert
                  * one more group of 8 pixels */
 276             p_y -= i_rewind;
 277             p_u -= i_rewind >> 1;
 278             p_v -= i_rewind >> 1;
 279             p_buffer -= i_rewind;
 280
 281             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 282             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 283             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 284             CONVERT_YUV_PIXEL(2);  CONVERT_Y_PIXEL(2);
 285         }
             /* NOTE(review): both macros come from a header not shown here;
              * presumably they copy/scale the converted line and advance p_pic
              * -- confirm against their definitions */
 286         SCALE_WIDTH;
 287         SCALE_HEIGHT( 420, 2 );
 288
             /* Skip the Y line padding on every line; the chroma padding is
              * only skipped after odd-numbered lines */
 289         p_y += i_source_margin;
 290         if( i_y % 2 )
 291         {
 292             p_u += i_source_margin_c;
 293             p_v += i_source_margin_c;
 294         }
 295     }
 296 }
 297
 298 #else // ! defined (MODULE_NAME_IS_i420_rgb)
 299
 300 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src,
 301                                           picture_t *p_dest )
 302 {
         /* Converts the Y, U and V planes of p_src into 16-bit pixels stored in
          * p_dest using the *_UNPACK_15* macro variants. Compiled only when
          * MODULE_NAME_IS_i420_rgb is not defined; the body is the SSE2 version
          * when MODULE_NAME_IS_i420_rgb_sse2 is defined and the MMX version
          * otherwise.
          *
          *  p_filter: supplies the input/output sizes (fmt_in / fmt_out) and
          *            p_sys (line buffer p_buffer, offset array p_offset).
          *  p_src:    source picture; planes 0 and 1 provide the pitches.
          *  p_dest:   destination picture; only its first plane is written.
          *
          * Returns nothing.
          * NOTE(review): the SSE2_*, MMX_* and SCALE_* macros are defined
          * outside this part of the file and presumably use the locals declared
          * here by name -- check the headers before renaming or removing any
          * of them. */
 303     /* Output cursor (16-bit pixels of p_dest) and Y/U/V input cursors */
 304     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
 305     uint8_t  *p_y   = p_src->Y_PIXELS;
 306     uint8_t  *p_u   = p_src->U_PIXELS;
 307     uint8_t  *p_v   = p_src->V_PIXELS;
 308
 309     bool  b_hscale;                         /* horizontal scaling type */
 310     unsigned int i_vscale;                          /* vertical scaling type */
 311     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 312
 313     int         i_right_margin;      /* dest pitch minus dest visible pitch */
 314     int         i_rewind;      /* pixels to step back for the last pixel group */
 315     int         i_scale_count;                       /* scale modulo counter */
 316     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 317     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
 318
 319     /* Conversion buffer pointer */
 320     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
 321     uint16_t *  p_buffer;
 322
 323     /* Offset array pointer */
 324     int *       p_offset_start = p_filter->p_sys->p_offset;
 325     int *       p_offset;
 326
         /* Bytes between the visible end of a line and the start of the next
          * one, for the Y plane (p[0]) and for plane p[1] */
 327     const int i_source_margin = p_src->p[0].i_pitch
 328                                  - p_src->p[0].i_visible_pitch;
 329     const int i_source_margin_c = p_src->p[1].i_pitch
 330                                  - p_src->p[1].i_visible_pitch;
 331
 332     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 333
 334     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 335      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 336      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 337     SetOffset( p_filter->fmt_in.video.i_width,
 338                p_filter->fmt_in.video.i_height,
 339                p_filter->fmt_out.video.i_width,
 340                p_filter->fmt_out.video.i_height,
 341                &b_hscale, &i_vscale, p_offset_start );
 342
 343
 344     /*
 345      * Perform conversion
 346      */
         /* Counter starts at the output height when i_vscale == 1, at the input
          * height otherwise; it is not touched again in the visible code
          * (presumably updated inside SCALE_HEIGHT) */
 347     i_scale_count = ( i_vscale == 1 ) ?
 348                     p_filter->fmt_out.video.i_height :
 349                     p_filter->fmt_in.video.i_height;
 350
 351 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
 352
         /* SSE2 handles 16 pixels per group: 0 when the width is a multiple of
          * 16, otherwise the pixels missing to reach the next multiple */
 353     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
 354
 355     /*
 356     ** SSE2 128 bits fetch/store instructions are faster
 357     ** if memory access is 16 bytes aligned
 358     */
 359
 360     p_buffer = b_hscale ? p_buffer_start : p_pic;
         /* Aligned path only if both pitches and both start addresses have
          * their low 4 bits clear, i.e. are multiples of 16 */
 361     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
 362                     p_dest->p->i_pitch|
 363                     ((intptr_t)p_y)|
 364                     ((intptr_t)p_buffer))) )
 365     {
 366         /* use faster SSE2 aligned fetch and store */
 367         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 368         {
 369             p_pic_start = p_pic;
 370
                 /* p_buffer was set before the loop and is reset at the end of
                  * every line */
 371             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
 372             {
 373                 SSE2_CALL (
 374                     SSE2_INIT_16_ALIGNED
 375                     SSE2_YUV_MUL
 376                     SSE2_YUV_ADD
 377                     SSE2_UNPACK_15_ALIGNED
 378                 );
 379                 p_y += 16;
 380                 p_u += 8;
 381                 p_v += 8;
 382                 p_buffer += 16;
 383             }
 384             /* Here we do some unaligned reads and duplicate conversions, but
 385              * at least we have all the pixels */
 386             if( i_rewind )
 387             {
                     /* Stepping back breaks the 16-byte alignment, hence the
                      * UNALIGNED macro variants for this last group */
 388                 p_y -= i_rewind;
 389                 p_u -= i_rewind >> 1;
 390                 p_v -= i_rewind >> 1;
 391                 p_buffer -= i_rewind;
 392
 393                 SSE2_CALL (
 394                     SSE2_INIT_16_UNALIGNED
 395                     SSE2_YUV_MUL
 396                     SSE2_YUV_ADD
 397                     SSE2_UNPACK_15_UNALIGNED
 398                 );
                     /* p_buffer is not advanced here; it is reassigned at the
                      * end of this line iteration */
 399                 p_y += 16;
 400                 p_u += 8;
 401                 p_v += 8;
 402             }
                 /* NOTE(review): macros from a header not shown here;
                  * presumably they copy/scale the line and advance p_pic */
 403             SCALE_WIDTH;
 404             SCALE_HEIGHT( 420, 2 );
 405
                 /* Skip the Y padding on every line, the chroma padding only
                  * after odd-numbered lines */
 406             p_y += i_source_margin;
 407             if( i_y % 2 )
 408             {
 409                 p_u += i_source_margin_c;
 410                 p_v += i_source_margin_c;
 411             }
 412             p_buffer = b_hscale ? p_buffer_start : p_pic;
 413         }
 414     }
 415     else
 416     {
 417         /* use slower SSE2 unaligned fetch and store */
 418         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 419         {
 420             p_pic_start = p_pic;
 421             p_buffer = b_hscale ? p_buffer_start : p_pic;
 422
 423             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
 424             {
 425                 SSE2_CALL (
 426                     SSE2_INIT_16_UNALIGNED
 427                     SSE2_YUV_MUL
 428                     SSE2_YUV_ADD
 429                     SSE2_UNPACK_15_UNALIGNED
 430                 );
 431                 p_y += 16;
 432                 p_u += 8;
 433                 p_v += 8;
 434                 p_buffer += 16;
 435             }
 436             /* Here we do some unaligned reads and duplicate conversions, but
 437              * at least we have all the pixels */
 438             if( i_rewind )
 439             {
 440                 p_y -= i_rewind;
 441                 p_u -= i_rewind >> 1;
 442                 p_v -= i_rewind >> 1;
 443                 p_buffer -= i_rewind;
 444
 445                 SSE2_CALL (
 446                     SSE2_INIT_16_UNALIGNED
 447                     SSE2_YUV_MUL
 448                     SSE2_YUV_ADD
 449                     SSE2_UNPACK_15_UNALIGNED
 450                 );
                     /* p_buffer is not advanced here; it is reassigned at the
                      * start of the next line iteration */
 451                 p_y += 16;
 452                 p_u += 8;
 453                 p_v += 8;
 454             }
                 /* NOTE(review): macros from a header not shown here;
                  * presumably they copy/scale the line and advance p_pic */
 455             SCALE_WIDTH;
 456             SCALE_HEIGHT( 420, 2 );
 457
                 /* Skip the Y padding on every line, the chroma padding only
                  * after odd-numbered lines */
 458             p_y += i_source_margin;
 459             if( i_y % 2 )
 460             {
 461                 p_u += i_source_margin_c;
 462                 p_v += i_source_margin_c;
 463             }
 464             p_buffer = b_hscale ? p_buffer_start : p_pic;
 465         }
 466     }
 467
 468     /* make sure all SSE2 stores are visible thereafter */
 469     SSE2_END;
 470
 471 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
 472
         /* MMX handles 8 pixels per group: 0 when the width is a multiple of 8,
          * otherwise the pixels missing to reach the next multiple */
 473     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 474
 475     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 476     {
 477         p_pic_start = p_pic;
             /* Convert into the intermediate buffer when horizontal scaling is
              * requested, straight into the destination line otherwise */
 478         p_buffer = b_hscale ? p_buffer_start : p_pic;
 479
 480         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 481         {
 482             MMX_CALL (
 483                 MMX_INIT_16
 484                 MMX_YUV_MUL
 485                 MMX_YUV_ADD
 486                 MMX_UNPACK_15
 487             );
 488             p_y += 8;
 489             p_u += 4;
 490             p_v += 4;
 491             p_buffer += 8;
 492         }
 493
 494         /* Here we do some unaligned reads and duplicate conversions, but
 495          * at least we have all the pixels */
 496         if( i_rewind )
 497         {
                 /* Step the cursors back (half as far for chroma) and convert
                  * one more group of 8 pixels */
 498             p_y -= i_rewind;
 499             p_u -= i_rewind >> 1;
 500             p_v -= i_rewind >> 1;
 501             p_buffer -= i_rewind;
 502
 503             MMX_CALL (
 504                 MMX_INIT_16
 505                 MMX_YUV_MUL
 506                 MMX_YUV_ADD
 507                 MMX_UNPACK_15
 508             );
 509             p_y += 8;
 510             p_u += 4;
 511             p_v += 4;
 512             p_buffer += 8;
 513         }
             /* NOTE(review): macros from a header not shown here; presumably
              * they copy/scale the line and advance p_pic */
 514         SCALE_WIDTH;
 515         SCALE_HEIGHT( 420, 2 );
 516
             /* Skip the Y padding on every line, the chroma padding only after
              * odd-numbered lines */
 517         p_y += i_source_margin;
 518         if( i_y % 2 )
 519         {
 520             p_u += i_source_margin_c;
 521             p_v += i_source_margin_c;
 522         }
 523     }
 524     /* re-enable FPU registers */
 525     MMX_END;
 526
 527 #endif
 528 }
 529
 530 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src,
 531                                           picture_t *p_dest )
 532 {
         /* Converts the Y, U and V planes of p_src into 16-bit pixels stored in
          * p_dest using the *_UNPACK_16* macro variants (I420_R5G5B5 in this
          * file has the same structure with the *_UNPACK_15* variants).
          * Compiled only when MODULE_NAME_IS_i420_rgb is not defined; the body
          * is the SSE2 version when MODULE_NAME_IS_i420_rgb_sse2 is defined and
          * the MMX version otherwise.
          *
          *  p_filter: supplies the input/output sizes (fmt_in / fmt_out) and
          *            p_sys (line buffer p_buffer, offset array p_offset).
          *  p_src:    source picture; planes 0 and 1 provide the pitches.
          *  p_dest:   destination picture; only its first plane is written.
          *
          * Returns nothing.
          * NOTE(review): the SSE2_*, MMX_* and SCALE_* macros are defined
          * outside this part of the file and presumably use the locals declared
          * here by name -- check the headers before renaming or removing any
          * of them. */
 533     /* Output cursor (16-bit pixels of p_dest) and Y/U/V input cursors */
 534     uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
 535     uint8_t  *p_y   = p_src->Y_PIXELS;
 536     uint8_t  *p_u   = p_src->U_PIXELS;
 537     uint8_t  *p_v   = p_src->V_PIXELS;
 538
 539     bool  b_hscale;                         /* horizontal scaling type */
 540     unsigned int i_vscale;                          /* vertical scaling type */
 541     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 542
 543     int         i_right_margin;      /* dest pitch minus dest visible pitch */
 544     int         i_rewind;      /* pixels to step back for the last pixel group */
 545     int         i_scale_count;                       /* scale modulo counter */
 546     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 547     uint16_t *  p_pic_start;       /* beginning of the current line for copy */
 548
 549     /* Conversion buffer pointer */
 550     uint16_t *  p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
 551     uint16_t *  p_buffer;
 552
 553     /* Offset array pointer */
 554     int *       p_offset_start = p_filter->p_sys->p_offset;
 555     int *       p_offset;
 556
         /* Bytes between the visible end of a line and the start of the next
          * one, for the Y plane (p[0]) and for plane p[1] */
 557     const int i_source_margin = p_src->p[0].i_pitch
 558                                  - p_src->p[0].i_visible_pitch;
 559     const int i_source_margin_c = p_src->p[1].i_pitch
 560                                  - p_src->p[1].i_visible_pitch;
 561
 562     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 563
 564     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 565      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 566      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 567     SetOffset( p_filter->fmt_in.video.i_width,
 568                p_filter->fmt_in.video.i_height,
 569                p_filter->fmt_out.video.i_width,
 570                p_filter->fmt_out.video.i_height,
 571                &b_hscale, &i_vscale, p_offset_start );
 572
 573
 574     /*
 575      * Perform conversion
 576      */
         /* Counter starts at the output height when i_vscale == 1, at the input
          * height otherwise; it is not touched again in the visible code
          * (presumably updated inside SCALE_HEIGHT) */
 577     i_scale_count = ( i_vscale == 1 ) ?
 578                     p_filter->fmt_out.video.i_height :
 579                     p_filter->fmt_in.video.i_height;
 580
 581 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
 582
         /* SSE2 handles 16 pixels per group: 0 when the width is a multiple of
          * 16, otherwise the pixels missing to reach the next multiple */
 583     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
 584
 585     /*
 586     ** SSE2 128 bits fetch/store instructions are faster
 587     ** if memory access is 16 bytes aligned
 588     */
 589
 590     p_buffer = b_hscale ? p_buffer_start : p_pic;
         /* Aligned path only if both pitches and both start addresses have
          * their low 4 bits clear, i.e. are multiples of 16 */
 591     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
 592                     p_dest->p->i_pitch|
 593                     ((intptr_t)p_y)|
 594                     ((intptr_t)p_buffer))) )
 595     {
 596         /* use faster SSE2 aligned fetch and store */
 597         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 598         {
 599             p_pic_start = p_pic;
 600
                 /* p_buffer was set before the loop and is reset at the end of
                  * every line */
 601             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
 602             {
 603                 SSE2_CALL (
 604                     SSE2_INIT_16_ALIGNED
 605                     SSE2_YUV_MUL
 606                     SSE2_YUV_ADD
 607                     SSE2_UNPACK_16_ALIGNED
 608                 );
 609                 p_y += 16;
 610                 p_u += 8;
 611                 p_v += 8;
 612                 p_buffer += 16;
 613             }
 614             /* Here we do some unaligned reads and duplicate conversions, but
 615              * at least we have all the pixels */
 616             if( i_rewind )
 617             {
                     /* Stepping back breaks the 16-byte alignment, hence the
                      * UNALIGNED macro variants for this last group */
 618                 p_y -= i_rewind;
 619                 p_u -= i_rewind >> 1;
 620                 p_v -= i_rewind >> 1;
 621                 p_buffer -= i_rewind;
 622
 623                 SSE2_CALL (
 624                     SSE2_INIT_16_UNALIGNED
 625                     SSE2_YUV_MUL
 626                     SSE2_YUV_ADD
 627                     SSE2_UNPACK_16_UNALIGNED
 628                 );
                     /* p_buffer is not advanced here; it is reassigned at the
                      * end of this line iteration */
 629                 p_y += 16;
 630                 p_u += 8;
 631                 p_v += 8;
 632             }
                 /* NOTE(review): macros from a header not shown here;
                  * presumably they copy/scale the line and advance p_pic */
 633             SCALE_WIDTH;
 634             SCALE_HEIGHT( 420, 2 );
 635
                 /* Skip the Y padding on every line, the chroma padding only
                  * after odd-numbered lines */
 636             p_y += i_source_margin;
 637             if( i_y % 2 )
 638             {
 639                 p_u += i_source_margin_c;
 640                 p_v += i_source_margin_c;
 641             }
 642             p_buffer = b_hscale ? p_buffer_start : p_pic;
 643         }
 644     }
 645     else
 646     {
 647         /* use slower SSE2 unaligned fetch and store */
 648         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 649         {
 650             p_pic_start = p_pic;
 651             p_buffer = b_hscale ? p_buffer_start : p_pic;
 652
 653             for ( i_x = p_filter->fmt_in.video.i_width/16; i_x--; )
 654             {
 655                 SSE2_CALL(
 656                     SSE2_INIT_16_UNALIGNED
 657                     SSE2_YUV_MUL
 658                     SSE2_YUV_ADD
 659                     SSE2_UNPACK_16_UNALIGNED
 660                 );
 661                 p_y += 16;
 662                 p_u += 8;
 663                 p_v += 8;
 664                 p_buffer += 16;
 665             }
 666             /* Here we do some unaligned reads and duplicate conversions, but
 667              * at least we have all the pixels */
 668             if( i_rewind )
 669             {
 670                 p_y -= i_rewind;
 671                 p_u -= i_rewind >> 1;
 672                 p_v -= i_rewind >> 1;
 673                 p_buffer -= i_rewind;
 674
 675                 SSE2_CALL(
 676                     SSE2_INIT_16_UNALIGNED
 677                     SSE2_YUV_MUL
 678                     SSE2_YUV_ADD
 679                     SSE2_UNPACK_16_UNALIGNED
 680                 );
                     /* p_buffer is not advanced here; it is reassigned at the
                      * start of the next line iteration */
 681                 p_y += 16;
 682                 p_u += 8;
 683                 p_v += 8;
 684             }
                 /* NOTE(review): macros from a header not shown here;
                  * presumably they copy/scale the line and advance p_pic */
 685             SCALE_WIDTH;
 686             SCALE_HEIGHT( 420, 2 );
 687
                 /* Skip the Y padding on every line, the chroma padding only
                  * after odd-numbered lines */
 688             p_y += i_source_margin;
 689             if( i_y % 2 )
 690             {
 691                 p_u += i_source_margin_c;
 692                 p_v += i_source_margin_c;
 693             }
 694             p_buffer = b_hscale ? p_buffer_start : p_pic;
 695         }
 696     }
 697
 698     /* make sure all SSE2 stores are visible thereafter */
 699     SSE2_END;
 700
 701 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
 702
         /* MMX handles 8 pixels per group: 0 when the width is a multiple of 8,
          * otherwise the pixels missing to reach the next multiple */
 703     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 704
 705     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 706     {
 707         p_pic_start = p_pic;
             /* Convert into the intermediate buffer when horizontal scaling is
              * requested, straight into the destination line otherwise */
 708         p_buffer = b_hscale ? p_buffer_start : p_pic;
 709
 710         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 711         {
 712             MMX_CALL (
 713                 MMX_INIT_16
 714                 MMX_YUV_MUL
 715                 MMX_YUV_ADD
 716                 MMX_UNPACK_16
 717             );
 718             p_y += 8;
 719             p_u += 4;
 720             p_v += 4;
 721             p_buffer += 8;
 722         }
 723
 724         /* Here we do some unaligned reads and duplicate conversions, but
 725          * at least we have all the pixels */
 726         if( i_rewind )
 727         {
                 /* Step the cursors back (half as far for chroma) and convert
                  * one more group of 8 pixels */
 728             p_y -= i_rewind;
 729             p_u -= i_rewind >> 1;
 730             p_v -= i_rewind >> 1;
 731             p_buffer -= i_rewind;
 732
 733             MMX_CALL (
 734                 MMX_INIT_16
 735                 MMX_YUV_MUL
 736                 MMX_YUV_ADD
 737                 MMX_UNPACK_16
 738             );
 739             p_y += 8;
 740             p_u += 4;
 741             p_v += 4;
 742             p_buffer += 8;
 743         }
             /* NOTE(review): macros from a header not shown here; presumably
              * they copy/scale the line and advance p_pic */
 744         SCALE_WIDTH;
 745         SCALE_HEIGHT( 420, 2 );
 746
             /* Skip the Y padding on every line, the chroma padding only after
              * odd-numbered lines */
 747         p_y += i_source_margin;
 748         if( i_y % 2 )
 749         {
 750             p_u += i_source_margin_c;
 751             p_v += i_source_margin_c;
 752         }
 753     }
 754     /* re-enable FPU registers */
 755     MMX_END;
 756
 757 #endif
 758 }
 759
 760 #endif
 761
 762 /*****************************************************************************
 763  * I420_RGB32: color YUV 4:2:0 to RGB 32 bpp
 764  *****************************************************************************
 765  * Horizontal alignment needed:
 766  *  - input: 8 pixels (8 Y bytes, 4 U/V bytes), margins not allowed
 767  *  - output: 1 pixel (4 bytes), margins allowed
 768  * Vertical alignment needed:
 769  *  - input: 2 lines (2 Y lines, 1 U/V line)
 770  *  - output: 1 line
 771  *****************************************************************************/
 772
 773 #if defined (MODULE_NAME_IS_i420_rgb)
 774
 775 void I420_RGB32( filter_t *p_filter, picture_t *p_src,
 776                                          picture_t *p_dest )
 777 {
 778     /* We got this one from the old arguments */
 779     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
 780     uint8_t  *p_y   = p_src->Y_PIXELS;
 781     uint8_t  *p_u   = p_src->U_PIXELS;
 782     uint8_t  *p_v   = p_src->V_PIXELS;
 783
 784     bool  b_hscale;                         /* horizontal scaling type */
 785     unsigned int i_vscale;                          /* vertical scaling type */
 786     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 787
 788     int         i_right_margin;
 789     int         i_rewind;
 790     int         i_scale_count;                       /* scale modulo counter */
 791     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 792     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
 793     int         i_uval, i_vval;                           /* U and V samples */
 794     int         i_red, i_green, i_blue;          /* U and V modified samples */
 795     uint32_t *  p_yuv = p_filter->p_sys->p_rgb32;
 796     uint32_t *  p_ybase;                     /* Y dependant conversion table */
 797
 798     /* Conversion buffer pointer */
 799     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
 800     uint32_t *  p_buffer;
 801
 802     /* Offset array pointer */
 803     int *       p_offset_start = p_filter->p_sys->p_offset;
 804     int *       p_offset;
 805
 806     const int i_source_margin = p_src->p[0].i_pitch
 807                                  - p_src->p[0].i_visible_pitch;
 808     const int i_source_margin_c = p_src->p[1].i_pitch
 809                                  - p_src->p[1].i_visible_pitch;
 810
 811     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 812     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
 813
 814     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 815      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 816      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 817     SetOffset( p_filter->fmt_in.video.i_width,
 818                p_filter->fmt_in.video.i_height,
 819                p_filter->fmt_out.video.i_width,
 820                p_filter->fmt_out.video.i_height,
 821                &b_hscale, &i_vscale, p_offset_start );
 822
 823     /*
 824      * Perform conversion
 825      */
 826     i_scale_count = ( i_vscale == 1 ) ?
 827                     p_filter->fmt_out.video.i_height :
 828                     p_filter->fmt_in.video.i_height;
 829     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 830     {
 831         p_pic_start = p_pic;
 832         p_buffer = b_hscale ? p_buffer_start : p_pic;
 833
 834         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
 835         {
 836             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 837             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 838             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 839             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 840         }
 841
 842         /* Here we do some unaligned reads and duplicate conversions, but
 843          * at least we have all the pixels */
 844         if( i_rewind )
 845         {
 846             p_y -= i_rewind;
 847             p_u -= i_rewind >> 1;
 848             p_v -= i_rewind >> 1;
 849             p_buffer -= i_rewind;
 850             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 851             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 852             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 853             CONVERT_YUV_PIXEL(4);  CONVERT_Y_PIXEL(4);
 854         }
 855         SCALE_WIDTH;
 856         SCALE_HEIGHT( 420, 4 );
 857
 858         p_y += i_source_margin;
 859         if( i_y % 2 )
 860         {
 861             p_u += i_source_margin_c;
 862             p_v += i_source_margin_c;
 863         }
 864     }
 865 }
 866
 867 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) || defined (MODULE_NAME_IS_i420_rgb_sse2)
 868
 869 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
 870                                             picture_t *p_dest )
 871 {
 872     /* We got this one from the old arguments */
 873     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
 874     uint8_t  *p_y   = p_src->Y_PIXELS;
 875     uint8_t  *p_u   = p_src->U_PIXELS;
 876     uint8_t  *p_v   = p_src->V_PIXELS;
 877
 878     bool  b_hscale;                         /* horizontal scaling type */
 879     unsigned int i_vscale;                          /* vertical scaling type */
 880     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
 881
 882     int         i_right_margin;
 883     int         i_rewind;
 884     int         i_scale_count;                       /* scale modulo counter */
 885     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
 886     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
 887     /* Conversion buffer pointer */
 888     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
 889     uint32_t *  p_buffer;
 890
 891     /* Offset array pointer */
 892     int *       p_offset_start = p_filter->p_sys->p_offset;
 893     int *       p_offset;
 894
 895     const int i_source_margin = p_src->p[0].i_pitch
 896                                  - p_src->p[0].i_visible_pitch;
 897     const int i_source_margin_c = p_src->p[1].i_pitch
 898                                  - p_src->p[1].i_visible_pitch;
 899
 900     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
 901
 902     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
 903      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
 904      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
 905     SetOffset( p_filter->fmt_in.video.i_width,
 906                p_filter->fmt_in.video.i_height,
 907                p_filter->fmt_out.video.i_width,
 908                p_filter->fmt_out.video.i_height,
 909                &b_hscale, &i_vscale, p_offset_start );
 910
 911     /*
 912      * Perform conversion
 913      */
 914     i_scale_count = ( i_vscale == 1 ) ?
 915                     p_filter->fmt_out.video.i_height :
 916                     p_filter->fmt_in.video.i_height;
 917
 918 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
 919
 920     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
 921
 922     /*
 923     ** SSE2 128 bits fetch/store instructions are faster
 924     ** if memory access is 16 bytes aligned
 925     */
 926
 927     p_buffer = b_hscale ? p_buffer_start : p_pic;
 928     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
 929                     p_dest->p->i_pitch|
 930                     ((intptr_t)p_y)|
 931                     ((intptr_t)p_buffer))) )
 932     {
 933         /* use faster SSE2 aligned fetch and store */
 934         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 935         {
 936             p_pic_start = p_pic;
 937
 938             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
 939             {
 940                 SSE2_CALL (
 941                     SSE2_INIT_32_ALIGNED
 942                     SSE2_YUV_MUL
 943                     SSE2_YUV_ADD
 944                     SSE2_UNPACK_32_ARGB_ALIGNED
 945                 );
 946                 p_y += 16;
 947                 p_u += 8;
 948                 p_v += 8;
 949                 p_buffer += 16;
 950             }
 951
 952             /* Here we do some unaligned reads and duplicate conversions, but
 953              * at least we have all the pixels */
 954             if( i_rewind )
 955             {
 956                 p_y -= i_rewind;
 957                 p_u -= i_rewind >> 1;
 958                 p_v -= i_rewind >> 1;
 959                 p_buffer -= i_rewind;
 960                 SSE2_CALL (
 961                     SSE2_INIT_32_UNALIGNED
 962                     SSE2_YUV_MUL
 963                     SSE2_YUV_ADD
 964                     SSE2_UNPACK_32_ARGB_UNALIGNED
 965                 );
 966                 p_y += 16;
 967                 p_u += 4;
 968                 p_v += 4;
 969             }
 970             SCALE_WIDTH;
 971             SCALE_HEIGHT( 420, 4 );
 972
 973             p_y += i_source_margin;
 974             if( i_y % 2 )
 975             {
 976                 p_u += i_source_margin_c;
 977                 p_v += i_source_margin_c;
 978             }
 979             p_buffer = b_hscale ? p_buffer_start : p_pic;
 980         }
 981     }
 982     else
 983     {
 984         /* use slower SSE2 unaligned fetch and store */
 985         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
 986         {
 987             p_pic_start = p_pic;
 988             p_buffer = b_hscale ? p_buffer_start : p_pic;
 989
 990             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
 991             {
 992                 SSE2_CALL (
 993                     SSE2_INIT_32_UNALIGNED
 994                     SSE2_YUV_MUL
 995                     SSE2_YUV_ADD
 996                     SSE2_UNPACK_32_ARGB_UNALIGNED
 997                 );
 998                 p_y += 16;
 999                 p_u += 8;
1000                 p_v += 8;
1001                 p_buffer += 16;
1002             }
1003
1004             /* Here we do some unaligned reads and duplicate conversions, but
1005              * at least we have all the pixels */
1006             if( i_rewind )
1007             {
1008                 p_y -= i_rewind;
1009                 p_u -= i_rewind >> 1;
1010                 p_v -= i_rewind >> 1;
1011                 p_buffer -= i_rewind;
1012                 SSE2_CALL (
1013                     SSE2_INIT_32_UNALIGNED
1014                     SSE2_YUV_MUL
1015                     SSE2_YUV_ADD
1016                     SSE2_UNPACK_32_ARGB_UNALIGNED
1017                 );
1018                 p_y += 16;
1019                 p_u += 8;
1020                 p_v += 8;
1021             }
1022             SCALE_WIDTH;
1023             SCALE_HEIGHT( 420, 4 );
1024
1025             p_y += i_source_margin;
1026             if( i_y % 2 )
1027             {
1028                 p_u += i_source_margin_c;
1029                 p_v += i_source_margin_c;
1030             }
1031             p_buffer = b_hscale ? p_buffer_start : p_pic;
1032         }
1033     }
1034
1035     /* make sure all SSE2 stores are visible thereafter */
1036     SSE2_END;
1037
1038 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1039
1040     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1041
1042     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1043     {
1044         p_pic_start = p_pic;
1045         p_buffer = b_hscale ? p_buffer_start : p_pic;
1046
1047         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1048         {
1049             MMX_CALL (
1050                 MMX_INIT_32
1051                 MMX_YUV_MUL
1052                 MMX_YUV_ADD
1053                 MMX_UNPACK_32_ARGB
1054             );
1055             p_y += 8;
1056             p_u += 4;
1057             p_v += 4;
1058             p_buffer += 8;
1059         }
1060
1061         /* Here we do some unaligned reads and duplicate conversions, but
1062          * at least we have all the pixels */
1063         if( i_rewind )
1064         {
1065             p_y -= i_rewind;
1066             p_u -= i_rewind >> 1;
1067             p_v -= i_rewind >> 1;
1068             p_buffer -= i_rewind;
1069             MMX_CALL (
1070                 MMX_INIT_32
1071                 MMX_YUV_MUL
1072                 MMX_YUV_ADD
1073                 MMX_UNPACK_32_ARGB
1074             );
1075             p_y += 8;
1076             p_u += 4;
1077             p_v += 4;
1078             p_buffer += 8;
1079         }
1080         SCALE_WIDTH;
1081         SCALE_HEIGHT( 420, 4 );
1082
1083         p_y += i_source_margin;
1084         if( i_y % 2 )
1085         {
1086             p_u += i_source_margin_c;
1087             p_v += i_source_margin_c;
1088         }
1089     }
1090
1091     /* re-enable FPU registers */
1092     MMX_END;
1093
1094 #endif
1095 }
1096
1097 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src,
1098                                             picture_t *p_dest )
1099 {
1100     /* We got this one from the old arguments */
1101     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1102     uint8_t  *p_y   = p_src->Y_PIXELS;
1103     uint8_t  *p_u   = p_src->U_PIXELS;
1104     uint8_t  *p_v   = p_src->V_PIXELS;
1105
1106     bool  b_hscale;                         /* horizontal scaling type */
1107     unsigned int i_vscale;                          /* vertical scaling type */
1108     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1109
1110     int         i_right_margin;
1111     int         i_rewind;
1112     int         i_scale_count;                       /* scale modulo counter */
1113     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1114     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1115     /* Conversion buffer pointer */
1116     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1117     uint32_t *  p_buffer;
1118
1119     /* Offset array pointer */
1120     int *       p_offset_start = p_filter->p_sys->p_offset;
1121     int *       p_offset;
1122
1123     const int i_source_margin = p_src->p[0].i_pitch
1124                                  - p_src->p[0].i_visible_pitch;
1125     const int i_source_margin_c = p_src->p[1].i_pitch
1126                                  - p_src->p[1].i_visible_pitch;
1127
1128     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1129
1130     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1131      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1132      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1133     SetOffset( p_filter->fmt_in.video.i_width,
1134                p_filter->fmt_in.video.i_height,
1135                p_filter->fmt_out.video.i_width,
1136                p_filter->fmt_out.video.i_height,
1137                &b_hscale, &i_vscale, p_offset_start );
1138
1139     /*
1140      * Perform conversion
1141      */
1142     i_scale_count = ( i_vscale == 1 ) ?
1143                     p_filter->fmt_out.video.i_height :
1144                     p_filter->fmt_in.video.i_height;
1145
1146 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1147
1148     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1149
1150     /*
1151     ** SSE2 128 bits fetch/store instructions are faster
1152     ** if memory access is 16 bytes aligned
1153     */
1154
1155     p_buffer = b_hscale ? p_buffer_start : p_pic;
1156     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1157                     p_dest->p->i_pitch|
1158                     ((intptr_t)p_y)|
1159                     ((intptr_t)p_buffer))) )
1160     {
1161         /* use faster SSE2 aligned fetch and store */
1162         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1163         {
1164             p_pic_start = p_pic;
1165
1166             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1167             {
1168                 SSE2_CALL (
1169                     SSE2_INIT_32_ALIGNED
1170                     SSE2_YUV_MUL
1171                     SSE2_YUV_ADD
1172                     SSE2_UNPACK_32_RGBA_ALIGNED
1173                 );
1174                 p_y += 16;
1175                 p_u += 8;
1176                 p_v += 8;
1177                 p_buffer += 16;
1178             }
1179
1180             /* Here we do some unaligned reads and duplicate conversions, but
1181              * at least we have all the pixels */
1182             if( i_rewind )
1183             {
1184                 p_y -= i_rewind;
1185                 p_u -= i_rewind >> 1;
1186                 p_v -= i_rewind >> 1;
1187                 p_buffer -= i_rewind;
1188                 SSE2_CALL (
1189                     SSE2_INIT_32_UNALIGNED
1190                     SSE2_YUV_MUL
1191                     SSE2_YUV_ADD
1192                     SSE2_UNPACK_32_RGBA_UNALIGNED
1193                 );
1194                 p_y += 16;
1195                 p_u += 4;
1196                 p_v += 4;
1197             }
1198             SCALE_WIDTH;
1199             SCALE_HEIGHT( 420, 4 );
1200
1201             p_y += i_source_margin;
1202             if( i_y % 2 )
1203             {
1204                 p_u += i_source_margin_c;
1205                 p_v += i_source_margin_c;
1206             }
1207             p_buffer = b_hscale ? p_buffer_start : p_pic;
1208         }
1209     }
1210     else
1211     {
1212         /* use slower SSE2 unaligned fetch and store */
1213         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1214         {
1215             p_pic_start = p_pic;
1216             p_buffer = b_hscale ? p_buffer_start : p_pic;
1217
1218             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1219             {
1220                 SSE2_CALL (
1221                     SSE2_INIT_32_UNALIGNED
1222                     SSE2_YUV_MUL
1223                     SSE2_YUV_ADD
1224                     SSE2_UNPACK_32_RGBA_UNALIGNED
1225                 );
1226                 p_y += 16;
1227                 p_u += 8;
1228                 p_v += 8;
1229                 p_buffer += 16;
1230             }
1231
1232             /* Here we do some unaligned reads and duplicate conversions, but
1233              * at least we have all the pixels */
1234             if( i_rewind )
1235             {
1236                 p_y -= i_rewind;
1237                 p_u -= i_rewind >> 1;
1238                 p_v -= i_rewind >> 1;
1239                 p_buffer -= i_rewind;
1240                 SSE2_CALL (
1241                     SSE2_INIT_32_UNALIGNED
1242                     SSE2_YUV_MUL
1243                     SSE2_YUV_ADD
1244                     SSE2_UNPACK_32_RGBA_UNALIGNED
1245                 );
1246                 p_y += 16;
1247                 p_u += 8;
1248                 p_v += 8;
1249             }
1250             SCALE_WIDTH;
1251             SCALE_HEIGHT( 420, 4 );
1252
1253             p_y += i_source_margin;
1254             if( i_y % 2 )
1255             {
1256                 p_u += i_source_margin_c;
1257                 p_v += i_source_margin_c;
1258             }
1259             p_buffer = b_hscale ? p_buffer_start : p_pic;
1260         }
1261     }
1262
1263     /* make sure all SSE2 stores are visible thereafter */
1264     SSE2_END;
1265
1266 #else // defined (MODULE_NAME_IS_i420_rgb_mmx)
1267
1268     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1269
1270     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1271     {
1272         p_pic_start = p_pic;
1273         p_buffer = b_hscale ? p_buffer_start : p_pic;
1274
1275         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1276         {
1277             MMX_CALL (
1278                 MMX_INIT_32
1279                 MMX_YUV_MUL
1280                 MMX_YUV_ADD
1281                 MMX_UNPACK_32_RGBA
1282             );
1283             p_y += 8;
1284             p_u += 4;
1285             p_v += 4;
1286             p_buffer += 8;
1287         }
1288
1289         /* Here we do some unaligned reads and duplicate conversions, but
1290          * at least we have all the pixels */
1291         if( i_rewind )
1292         {
1293             p_y -= i_rewind;
1294             p_u -= i_rewind >> 1;
1295             p_v -= i_rewind >> 1;
1296             p_buffer -= i_rewind;
1297             MMX_CALL (
1298                 MMX_INIT_32
1299                 MMX_YUV_MUL
1300                 MMX_YUV_ADD
1301                 MMX_UNPACK_32_RGBA
1302             );
1303             p_y += 8;
1304             p_u += 4;
1305             p_v += 4;
1306             p_buffer += 8;
1307         }
1308         SCALE_WIDTH;
1309         SCALE_HEIGHT( 420, 4 );
1310
1311         p_y += i_source_margin;
1312         if( i_y % 2 )
1313         {
1314             p_u += i_source_margin_c;
1315             p_v += i_source_margin_c;
1316         }
1317     }
1318
1319     /* re-enable FPU registers */
1320     MMX_END;
1321
1322 #endif
1323 }
1324
1325 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src,
1326                                             picture_t *p_dest )
1327 {
1328     /* We got this one from the old arguments */
1329     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1330     uint8_t  *p_y   = p_src->Y_PIXELS;
1331     uint8_t  *p_u   = p_src->U_PIXELS;
1332     uint8_t  *p_v   = p_src->V_PIXELS;
1333
1334     bool  b_hscale;                         /* horizontal scaling type */
1335     unsigned int i_vscale;                          /* vertical scaling type */
1336     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1337
1338     int         i_right_margin;
1339     int         i_rewind;
1340     int         i_scale_count;                       /* scale modulo counter */
1341     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1342     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1343     /* Conversion buffer pointer */
1344     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1345     uint32_t *  p_buffer;
1346
1347     /* Offset array pointer */
1348     int *       p_offset_start = p_filter->p_sys->p_offset;
1349     int *       p_offset;
1350
1351     const int i_source_margin = p_src->p[0].i_pitch
1352                                  - p_src->p[0].i_visible_pitch;
1353     const int i_source_margin_c = p_src->p[1].i_pitch
1354                                  - p_src->p[1].i_visible_pitch;
1355
1356     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1357
1358     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1359      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1360      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1361     SetOffset( p_filter->fmt_in.video.i_width,
1362                p_filter->fmt_in.video.i_height,
1363                p_filter->fmt_out.video.i_width,
1364                p_filter->fmt_out.video.i_height,
1365                &b_hscale, &i_vscale, p_offset_start );
1366
1367     /*
1368      * Perform conversion
1369      */
1370     i_scale_count = ( i_vscale == 1 ) ?
1371                     p_filter->fmt_out.video.i_height :
1372                     p_filter->fmt_in.video.i_height;
1373
1374 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1375
1376     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1377
1378     /*
1379     ** SSE2 128 bits fetch/store instructions are faster
1380     ** if memory access is 16 bytes aligned
1381     */
1382
1383     p_buffer = b_hscale ? p_buffer_start : p_pic;
1384     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1385                     p_dest->p->i_pitch|
1386                     ((intptr_t)p_y)|
1387                     ((intptr_t)p_buffer))) )
1388     {
1389         /* use faster SSE2 aligned fetch and store */
1390         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1391         {
1392             p_pic_start = p_pic;
1393
1394             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1395             {
1396                 SSE2_CALL (
1397                     SSE2_INIT_32_ALIGNED
1398                     SSE2_YUV_MUL
1399                     SSE2_YUV_ADD
1400                     SSE2_UNPACK_32_BGRA_ALIGNED
1401                 );
1402                 p_y += 16;
1403                 p_u += 8;
1404                 p_v += 8;
1405                 p_buffer += 16;
1406             }
1407
1408             /* Here we do some unaligned reads and duplicate conversions, but
1409              * at least we have all the pixels */
1410             if( i_rewind )
1411             {
1412                 p_y -= i_rewind;
1413                 p_u -= i_rewind >> 1;
1414                 p_v -= i_rewind >> 1;
1415                 p_buffer -= i_rewind;
1416                 SSE2_CALL (
1417                     SSE2_INIT_32_UNALIGNED
1418                     SSE2_YUV_MUL
1419                     SSE2_YUV_ADD
1420                     SSE2_UNPACK_32_BGRA_UNALIGNED
1421                 );
1422                 p_y += 16;
1423                 p_u += 4;
1424                 p_v += 4;
1425             }
1426             SCALE_WIDTH;
1427             SCALE_HEIGHT( 420, 4 );
1428
1429             p_y += i_source_margin;
1430             if( i_y % 2 )
1431             {
1432                 p_u += i_source_margin_c;
1433                 p_v += i_source_margin_c;
1434             }
1435             p_buffer = b_hscale ? p_buffer_start : p_pic;
1436         }
1437     }
1438     else
1439     {
1440         /* use slower SSE2 unaligned fetch and store */
1441         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1442         {
1443             p_pic_start = p_pic;
1444             p_buffer = b_hscale ? p_buffer_start : p_pic;
1445
1446             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1447             {
1448                 SSE2_CALL (
1449                     SSE2_INIT_32_UNALIGNED
1450                     SSE2_YUV_MUL
1451                     SSE2_YUV_ADD
1452                     SSE2_UNPACK_32_BGRA_UNALIGNED
1453                 );
1454                 p_y += 16;
1455                 p_u += 8;
1456                 p_v += 8;
1457                 p_buffer += 16;
1458             }
1459
1460             /* Here we do some unaligned reads and duplicate conversions, but
1461              * at least we have all the pixels */
1462             if( i_rewind )
1463             {
1464                 p_y -= i_rewind;
1465                 p_u -= i_rewind >> 1;
1466                 p_v -= i_rewind >> 1;
1467                 p_buffer -= i_rewind;
1468                 SSE2_CALL (
1469                     SSE2_INIT_32_UNALIGNED
1470                     SSE2_YUV_MUL
1471                     SSE2_YUV_ADD
1472                     SSE2_UNPACK_32_BGRA_UNALIGNED
1473                 );
1474                 p_y += 16;
1475                 p_u += 8;
1476                 p_v += 8;
1477             }
1478             SCALE_WIDTH;
1479             SCALE_HEIGHT( 420, 4 );
1480
1481             p_y += i_source_margin;
1482             if( i_y % 2 )
1483             {
1484                 p_u += i_source_margin_c;
1485                 p_v += i_source_margin_c;
1486             }
1487             p_buffer = b_hscale ? p_buffer_start : p_pic;
1488         }
1489     }
1490
1491 #else
1492
1493     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1494
1495     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1496     {
1497         p_pic_start = p_pic;
1498         p_buffer = b_hscale ? p_buffer_start : p_pic;
1499
1500         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1501         {
1502             MMX_CALL (
1503                 MMX_INIT_32
1504                 MMX_YUV_MUL
1505                 MMX_YUV_ADD
1506                 MMX_UNPACK_32_BGRA
1507             );
1508             p_y += 8;
1509             p_u += 4;
1510             p_v += 4;
1511             p_buffer += 8;
1512         }
1513
1514         /* Here we do some unaligned reads and duplicate conversions, but
1515          * at least we have all the pixels */
1516         if( i_rewind )
1517         {
1518             p_y -= i_rewind;
1519             p_u -= i_rewind >> 1;
1520             p_v -= i_rewind >> 1;
1521             p_buffer -= i_rewind;
1522             MMX_CALL (
1523                 MMX_INIT_32
1524                 MMX_YUV_MUL
1525                 MMX_YUV_ADD
1526                 MMX_UNPACK_32_BGRA
1527             );
1528             p_y += 8;
1529             p_u += 4;
1530             p_v += 4;
1531             p_buffer += 8;
1532         }
1533         SCALE_WIDTH;
1534         SCALE_HEIGHT( 420, 4 );
1535
1536         p_y += i_source_margin;
1537         if( i_y % 2 )
1538         {
1539             p_u += i_source_margin_c;
1540             p_v += i_source_margin_c;
1541         }
1542     }
1543
1544     /* re-enable FPU registers */
1545     MMX_END;
1546
1547 #endif
1548 }
1549
1550 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src,
1551                                             picture_t *p_dest )
1552 {
1553     /* We got this one from the old arguments */
1554     uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1555     uint8_t  *p_y   = p_src->Y_PIXELS;
1556     uint8_t  *p_u   = p_src->U_PIXELS;
1557     uint8_t  *p_v   = p_src->V_PIXELS;
1558
1559     bool  b_hscale;                         /* horizontal scaling type */
1560     unsigned int i_vscale;                          /* vertical scaling type */
1561     unsigned int i_x, i_y;                /* horizontal and vertical indexes */
1562
1563     int         i_right_margin;
1564     int         i_rewind;
1565     int         i_scale_count;                       /* scale modulo counter */
1566     int         i_chroma_width = p_filter->fmt_in.video.i_width / 2; /* chroma width */
1567     uint32_t *  p_pic_start;       /* beginning of the current line for copy */
1568     /* Conversion buffer pointer */
1569     uint32_t *  p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1570     uint32_t *  p_buffer;
1571
1572     /* Offset array pointer */
1573     int *       p_offset_start = p_filter->p_sys->p_offset;
1574     int *       p_offset;
1575
1576     const int i_source_margin = p_src->p[0].i_pitch
1577                                  - p_src->p[0].i_visible_pitch;
1578     const int i_source_margin_c = p_src->p[1].i_pitch
1579                                  - p_src->p[1].i_visible_pitch;
1580
1581     i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1582
1583     /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1584      * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1585      * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1586     SetOffset( p_filter->fmt_in.video.i_width,
1587                p_filter->fmt_in.video.i_height,
1588                p_filter->fmt_out.video.i_width,
1589                p_filter->fmt_out.video.i_height,
1590                &b_hscale, &i_vscale, p_offset_start );
1591
1592     /*
1593      * Perform conversion
1594      */
1595     i_scale_count = ( i_vscale == 1 ) ?
1596                     p_filter->fmt_out.video.i_height :
1597                     p_filter->fmt_in.video.i_height;
1598
1599 #if defined (MODULE_NAME_IS_i420_rgb_sse2)
1600
1601     i_rewind = (-p_filter->fmt_in.video.i_width) & 15;
1602
1603     /*
1604     ** SSE2 128 bits fetch/store instructions are faster
1605     ** if memory access is 16 bytes aligned
1606     */
1607
1608     p_buffer = b_hscale ? p_buffer_start : p_pic;
1609     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1610                     p_dest->p->i_pitch|
1611                     ((intptr_t)p_y)|
1612                     ((intptr_t)p_buffer))) )
1613     {
1614         /* use faster SSE2 aligned fetch and store */
1615         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1616         {
1617             p_pic_start = p_pic;
1618
1619             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1620             {
1621                 SSE2_CALL (
1622                     SSE2_INIT_32_ALIGNED
1623                     SSE2_YUV_MUL
1624                     SSE2_YUV_ADD
1625                     SSE2_UNPACK_32_ABGR_ALIGNED
1626                 );
1627                 p_y += 16;
1628                 p_u += 8;
1629                 p_v += 8;
1630                 p_buffer += 16;
1631             }
1632
1633             /* Here we do some unaligned reads and duplicate conversions, but
1634              * at least we have all the pixels */
1635             if( i_rewind )
1636             {
1637                 p_y -= i_rewind;
1638                 p_u -= i_rewind >> 1;
1639                 p_v -= i_rewind >> 1;
1640                 p_buffer -= i_rewind;
1641                 SSE2_CALL (
1642                     SSE2_INIT_32_UNALIGNED
1643                     SSE2_YUV_MUL
1644                     SSE2_YUV_ADD
1645                     SSE2_UNPACK_32_ABGR_UNALIGNED
1646                 );
1647                 p_y += 16;
1648                 p_u += 4;
1649                 p_v += 4;
1650             }
1651             SCALE_WIDTH;
1652             SCALE_HEIGHT( 420, 4 );
1653
1654             p_y += i_source_margin;
1655             if( i_y % 2 )
1656             {
1657                 p_u += i_source_margin_c;
1658                 p_v += i_source_margin_c;
1659             }
1660             p_buffer = b_hscale ? p_buffer_start : p_pic;
1661         }
1662     }
1663     else
1664     {
1665         /* use slower SSE2 unaligned fetch and store */
1666         for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1667         {
1668             p_pic_start = p_pic;
1669             p_buffer = b_hscale ? p_buffer_start : p_pic;
1670
1671             for ( i_x = p_filter->fmt_in.video.i_width / 16; i_x--; )
1672             {
1673                 SSE2_CALL (
1674                     SSE2_INIT_32_UNALIGNED
1675                     SSE2_YUV_MUL
1676                     SSE2_YUV_ADD
1677                     SSE2_UNPACK_32_ABGR_UNALIGNED
1678                 );
1679                 p_y += 16;
1680                 p_u += 8;
1681                 p_v += 8;
1682                 p_buffer += 16;
1683             }
1684
1685             /* Here we do some unaligned reads and duplicate conversions, but
1686              * at least we have all the pixels */
1687             if( i_rewind )
1688             {
1689                 p_y -= i_rewind;
1690                 p_u -= i_rewind >> 1;
1691                 p_v -= i_rewind >> 1;
1692                 p_buffer -= i_rewind;
1693                 SSE2_CALL (
1694                     SSE2_INIT_32_UNALIGNED
1695                     SSE2_YUV_MUL
1696                     SSE2_YUV_ADD
1697                     SSE2_UNPACK_32_ABGR_UNALIGNED
1698                 );
1699                 p_y += 16;
1700                 p_u += 8;
1701                 p_v += 8;
1702             }
1703             SCALE_WIDTH;
1704             SCALE_HEIGHT( 420, 4 );
1705
1706             p_y += i_source_margin;
1707             if( i_y % 2 )
1708             {
1709                 p_u += i_source_margin_c;
1710                 p_v += i_source_margin_c;
1711             }
1712             p_buffer = b_hscale ? p_buffer_start : p_pic;
1713         }
1714     }
1715
1716 #else
1717
1718     i_rewind = (-p_filter->fmt_in.video.i_width) & 7;
1719
1720     for( i_y = 0; i_y < p_filter->fmt_in.video.i_height; i_y++ )
1721     {
1722         p_pic_start = p_pic;
1723         p_buffer = b_hscale ? p_buffer_start : p_pic;
1724
1725         for ( i_x = p_filter->fmt_in.video.i_width / 8; i_x--; )
1726         {
1727             MMX_CALL (
1728                 MMX_INIT_32
1729                 MMX_YUV_MUL
1730                 MMX_YUV_ADD
1731                 MMX_UNPACK_32_ABGR
1732             );
1733             p_y += 8;
1734             p_u += 4;
1735             p_v += 4;
1736             p_buffer += 8;
1737         }
1738
1739         /* Here we do some unaligned reads and duplicate conversions, but
1740          * at least we have all the pixels */
1741         if( i_rewind )
1742         {
1743             p_y -= i_rewind;
1744             p_u -= i_rewind >> 1;
1745             p_v -= i_rewind >> 1;
1746             p_buffer -= i_rewind;
1747             MMX_CALL (
1748                 MMX_INIT_32
1749                 MMX_YUV_MUL
1750                 MMX_YUV_ADD
1751                 MMX_UNPACK_32_ABGR
1752             );
1753             p_y += 8;
1754             p_u += 4;
1755             p_v += 4;
1756             p_buffer += 8;
1757         }
1758         SCALE_WIDTH;
1759         SCALE_HEIGHT( 420, 4 );
1760
1761         p_y += i_source_margin;
1762         if( i_y % 2 )
1763         {
1764             p_u += i_source_margin_c;
1765             p_v += i_source_margin_c;
1766         }
1767     }
1768
1769     /* re-enable FPU registers */
1770     MMX_END;
1771
1772 #endif
1773 }
1774
1775 #endif
1776
1777 /* Following functions are local */
1778
/*****************************************************************************
 * SetOffset: build offset array for conversion functions
 *****************************************************************************
 * Compares the source size (i_width x i_height) with the destination picture
 * size (i_pic_width x i_pic_height) and reports which kind of scaling is
 * needed:
 *  - *pb_hscale is set to false when both widths match, true otherwise;
 *  - *pi_vscale is set to 0 when both heights match, 1 when the picture is
 *    taller than the source and -1 (stored in an unsigned int) when shorter.
 * When the widths differ, p_offset is filled with a per-pixel step table
 * (presumably consumed by the horizontal scaling code of the converters):
 *  - extension: for each source pixel, a run of 0 entries followed by a 1;
 *  - reduction: one entry per destination pixel, holding a step >= 1.
 * p_offset is left untouched when no horizontal scaling is required.
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    const int i_width_delta  = i_pic_width - i_width;
    const int i_height_delta = i_pic_height - i_height;

    /*
     * Horizontal part: scaling flag and offset table
     */
    *pb_hscale = ( i_width_delta != 0 );

    if( i_width_delta > 0 )
    {
        /* Extension: walk the source pixels, emitting 0 while the running
         * remainder stays positive, then a single 1 to move on */
        int i_remainder = i_pic_width;
        int i_src;

        for( i_src = i_width; i_src != 0; i_src-- )
        {
            for( i_remainder -= i_width; i_remainder > 0;
                 i_remainder -= i_width )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_remainder += i_pic_width;
        }
    }
    else if( i_width_delta < 0 )
    {
        /* Reduction: walk the destination pixels, each entry counting how
         * many source pixels have to be stepped over */
        int i_remainder = i_width;
        int i_dst;

        for( i_dst = i_pic_width; i_dst != 0; i_dst-- )
        {
            int i_step = 1;

            for( i_remainder -= i_pic_width; i_remainder > 0;
                 i_remainder -= i_pic_width )
            {
                i_step++;
            }
            *p_offset++ = i_step;
            i_remainder += i_width;
        }
    }
    /* else: same width, conversion is done directly to the picture */

    /*
     * Vertical part: only the direction of the scaling is recorded
     */
    if( i_height_delta > 0 )
    {
        *pi_vscale = 1;
    }
    else if( i_height_delta < 0 )
    {
        *pi_vscale = -1;
    }
    else
    {
        *pi_vscale = 0;
    }
}
1848