d3d9: fix compilation after opengl rework
[vlc.git] / modules / video_chroma / i420_rgb16_x86.c
blob35d3272baf1e97473af87a1ffd41892d5ed17bc9
1 /*****************************************************************************
2 * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000 VLC authors and VideoLAN
6 * Authors: Samuel Hocevar <sam@zoy.org>
7 * Damien Fouilleul <damienf@videolan.org>
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of the GNU Lesser General Public License as published by
11 * the Free Software Foundation; either version 2.1 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public License
20 * along with this program; if not, write to the Free Software Foundation,
21 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
22 *****************************************************************************/
24 #ifdef HAVE_CONFIG_H
25 # include "config.h"
26 #endif
28 #include <vlc_common.h>
29 #include <vlc_filter.h>
30 #include <vlc_picture.h>
31 #include <vlc_cpu.h>
33 #include "i420_rgb.h"
34 #ifdef SSE2
35 # include "i420_rgb_sse2.h"
36 # define VLC_TARGET VLC_SSE
37 #else
38 # include "i420_rgb_mmx.h"
39 # define VLC_TARGET VLC_MMX
40 #endif
42 /*****************************************************************************
43 * SetOffset: build offset array for conversion functions
44 *****************************************************************************
45 * This function will build an offset array used in later conversion functions.
46 * It will also set horizontal and vertical scaling indicators.
47 *****************************************************************************/
/* Build the horizontal offset table consumed by the SCALE_WIDTH macro and
 * set the scaling-mode indicators.
 *   i_width, i_height         : source dimensions
 *   i_pic_width, i_pic_height : destination dimensions
 *   pb_hscale : set to 1 when horizontal scaling is needed, 0 otherwise
 *   pi_vscale : 0 = no vertical scaling, 1 = expansion, (unsigned)-1 = reduction
 *   p_offset  : caller-provided array filled with per-output-pixel source steps
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
48 static void SetOffset( int i_width, int i_height, int i_pic_width,
49 int i_pic_height, bool *pb_hscale,
50 unsigned int *pi_vscale, int *p_offset )
53 * Prepare horizontal offset array
55 if( i_pic_width - i_width == 0 )
56 { /* No horizontal scaling: YUV conversion is done directly to picture */
57 *pb_hscale = 0;
59 else if( i_pic_width - i_width > 0 )
60 { /* Prepare scaling array for horizontal extension */
61 int i_scale_count = i_pic_width;
63 *pb_hscale = 1;
64 for( int i_x = i_width; i_x--; )
/* Bresenham-style error accumulation: emit 0-steps (duplicate the current
 * source pixel) while the accumulator stays positive, then one 1-step to
 * advance to the next source pixel. */
66 while( (i_scale_count -= i_width) > 0 )
68 *p_offset++ = 0;
70 *p_offset++ = 1;
71 i_scale_count += i_pic_width;
74 else /* if( i_pic_width - i_width < 0 ) */
75 { /* Prepare scaling array for horizontal reduction */
76 int i_scale_count = i_pic_width;
78 *pb_hscale = 1;
79 for( int i_x = i_pic_width; i_x--; )
/* Reduction: accumulate in *p_offset how many source pixels each output
 * pixel consumes before moving to the next table entry. */
81 *p_offset = 1;
82 while( (i_scale_count -= i_pic_width) > 0 )
84 *p_offset += 1;
86 p_offset++;
87 i_scale_count += i_width;
92 * Set vertical scaling indicator
94 if( i_pic_height - i_height == 0 )
95 *pi_vscale = 0;
96 else if( i_pic_height - i_height > 0 )
97 *pi_vscale = 1;
98 else /* if( i_pic_height - i_height < 0 ) */
99 *pi_vscale = -1;
/* Convert an I420 (planar YUV 4:2:0) picture to 15-bit RGB (R5G5B5),
 * with optional scaling to the output geometry. Compiled against SSE2
 * (16 pixels per iteration, aligned fast path) or MMX (8 pixels per
 * iteration) depending on the SSE2 build flag.
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
102 VLC_TARGET
103 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
105 filter_sys_t *p_sys = p_filter->p_sys;
107 /* We got this one from the old arguments */
108 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
109 uint8_t *p_y = p_src->Y_PIXELS;
110 uint8_t *p_u = p_src->U_PIXELS;
111 uint8_t *p_v = p_src->V_PIXELS;
113 bool b_hscale; /* horizontal scaling type */
114 unsigned int i_vscale; /* vertical scaling type */
115 unsigned int i_x, i_y; /* horizontal and vertical indexes */
117 int i_right_margin;
118 int i_rewind;
119 int i_scale_count; /* scale modulo counter */
120 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
121 uint16_t * p_pic_start; /* beginning of the current line for copy */
123 /* Conversion buffer pointer */
124 uint16_t * p_buffer_start;
125 uint16_t * p_buffer;
127 /* Offset array pointer */
128 int * p_offset_start = p_sys->p_offset;
129 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
131 const int i_source_margin = p_src->p[0].i_pitch
132 - p_src->p[0].i_visible_pitch
133 - p_filter->fmt_in.video.i_x_offset;
134 const int i_source_margin_c = p_src->p[1].i_pitch
135 - p_src->p[1].i_visible_pitch
136 - ( p_filter->fmt_in.video.i_x_offset / 2 );
138 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
140 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
141 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
142 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
143 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
144 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
145 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
146 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
147 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
149 if(b_hscale &&
150 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
151 p_filter->fmt_in.video.i_x_offset +
152 p_filter->fmt_in.video.i_visible_width,
153 p_sys->i_bytespp))
154 return;
155 else p_buffer_start = (uint16_t*)p_sys->p_buffer;
158 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
160 i_scale_count = ( i_vscale == 1 ) ?
161 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
162 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
164 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
166 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
169 ** SSE2 128 bits fetch/store instructions are faster
170 ** if memory access is 16 bytes aligned
173 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
175 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
176 p_dest->p->i_pitch|
177 ((intptr_t)p_y)|
178 ((intptr_t)p_buffer))) )
180 /* use faster SSE2 aligned fetch and store */
181 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
183 p_pic_start = p_pic;
185 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
187 SSE2_CALL (
188 SSE2_INIT_16_ALIGNED
189 SSE2_YUV_MUL
190 SSE2_YUV_ADD
191 SSE2_UNPACK_15_ALIGNED
193 p_y += 16;
194 p_u += 8;
195 p_v += 8;
196 p_buffer += 16;
198 /* Here we do some unaligned reads and duplicate conversions, but
199 * at least we have all the pixels */
200 if( i_rewind )
202 p_y -= i_rewind;
203 p_u -= i_rewind >> 1;
204 p_v -= i_rewind >> 1;
205 p_buffer -= i_rewind;
207 SSE2_CALL (
208 SSE2_INIT_16_UNALIGNED
209 SSE2_YUV_MUL
210 SSE2_YUV_ADD
211 SSE2_UNPACK_15_UNALIGNED
213 p_y += 16;
214 p_u += 8;
215 p_v += 8;
217 SCALE_WIDTH;
218 SCALE_HEIGHT( 420, 2 );
220 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
221 if( i_y % 2 )
223 p_u += i_source_margin_c;
224 p_v += i_source_margin_c;
226 p_buffer = b_hscale ? p_buffer_start : p_pic;
229 else
231 /* use slower SSE2 unaligned fetch and store */
232 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
234 p_pic_start = p_pic;
236 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
238 SSE2_CALL (
239 SSE2_INIT_16_UNALIGNED
240 SSE2_YUV_MUL
241 SSE2_YUV_ADD
242 SSE2_UNPACK_15_UNALIGNED
244 p_y += 16;
245 p_u += 8;
246 p_v += 8;
247 p_buffer += 16;
249 /* Here we do some unaligned reads and duplicate conversions, but
250 * at least we have all the pixels */
251 if( i_rewind )
253 p_y -= i_rewind;
254 p_u -= i_rewind >> 1;
255 p_v -= i_rewind >> 1;
256 p_buffer -= i_rewind;
258 SSE2_CALL (
259 SSE2_INIT_16_UNALIGNED
260 SSE2_YUV_MUL
261 SSE2_YUV_ADD
262 SSE2_UNPACK_15_UNALIGNED
264 p_y += 16;
265 p_u += 8;
266 p_v += 8;
268 SCALE_WIDTH;
269 SCALE_HEIGHT( 420, 2 );
271 p_y += i_source_margin;
272 if( i_y % 2 )
274 p_u += i_source_margin_c;
275 p_v += i_source_margin_c;
277 p_buffer = b_hscale ? p_buffer_start : p_pic;
281 /* make sure all SSE2 stores are visible thereafter */
282 SSE2_END;
284 #else /* SSE2 */
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
286 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
288 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
290 p_pic_start = p_pic;
291 p_buffer = b_hscale ? p_buffer_start : p_pic;
293 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
295 MMX_CALL (
296 MMX_INIT_16
297 MMX_YUV_MUL
298 MMX_YUV_ADD
299 MMX_UNPACK_15
301 p_y += 8;
302 p_u += 4;
303 p_v += 4;
304 p_buffer += 8;
307 /* Here we do some unaligned reads and duplicate conversions, but
308 * at least we have all the pixels */
309 if( i_rewind )
311 p_y -= i_rewind;
312 p_u -= i_rewind >> 1;
313 p_v -= i_rewind >> 1;
314 p_buffer -= i_rewind;
316 MMX_CALL (
317 MMX_INIT_16
318 MMX_YUV_MUL
319 MMX_YUV_ADD
320 MMX_UNPACK_15
322 p_y += 8;
323 p_u += 4;
324 p_v += 4;
326 SCALE_WIDTH;
327 SCALE_HEIGHT( 420, 2 );
329 p_y += i_source_margin;
330 if( i_y % 2 )
332 p_u += i_source_margin_c;
333 p_v += i_source_margin_c;
336 /* re-enable FPU registers */
337 MMX_END;
339 #endif /* SSE2 */
/* Convert an I420 (planar YUV 4:2:0) picture to 16-bit RGB (R5G6B5),
 * with optional scaling to the output geometry. Same structure as
 * I420_R5G5B5, differing only in the UNPACK_16 packing macros.
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
342 VLC_TARGET
343 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
345 filter_sys_t *p_sys = p_filter->p_sys;
347 /* We got this one from the old arguments */
348 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
349 uint8_t *p_y = p_src->Y_PIXELS;
350 uint8_t *p_u = p_src->U_PIXELS;
351 uint8_t *p_v = p_src->V_PIXELS;
353 bool b_hscale; /* horizontal scaling type */
354 unsigned int i_vscale; /* vertical scaling type */
355 unsigned int i_x, i_y; /* horizontal and vertical indexes */
357 int i_right_margin;
358 int i_rewind;
359 int i_scale_count; /* scale modulo counter */
360 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
361 uint16_t * p_pic_start; /* beginning of the current line for copy */
363 /* Conversion buffer pointer */
364 uint16_t * p_buffer_start;
365 uint16_t * p_buffer;
367 /* Offset array pointer */
368 int * p_offset_start = p_sys->p_offset;
369 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
371 const int i_source_margin = p_src->p[0].i_pitch
372 - p_src->p[0].i_visible_pitch
373 - p_filter->fmt_in.video.i_x_offset;
374 const int i_source_margin_c = p_src->p[1].i_pitch
375 - p_src->p[1].i_visible_pitch
376 - ( p_filter->fmt_in.video.i_x_offset / 2 );
378 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
380 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
381 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
382 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
383 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
384 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
385 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
386 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
387 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
389 if(b_hscale &&
390 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
391 p_filter->fmt_in.video.i_x_offset +
392 p_filter->fmt_in.video.i_visible_width,
393 p_sys->i_bytespp))
394 return;
395 else p_buffer_start = (uint16_t*)p_sys->p_buffer;
398 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
400 i_scale_count = ( i_vscale == 1 ) ?
401 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
402 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
404 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
406 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
409 ** SSE2 128 bits fetch/store instructions are faster
410 ** if memory access is 16 bytes aligned
413 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
415 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
416 p_dest->p->i_pitch|
417 ((intptr_t)p_y)|
418 ((intptr_t)p_buffer))) )
420 /* use faster SSE2 aligned fetch and store */
421 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
423 p_pic_start = p_pic;
425 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
427 SSE2_CALL (
428 SSE2_INIT_16_ALIGNED
429 SSE2_YUV_MUL
430 SSE2_YUV_ADD
431 SSE2_UNPACK_16_ALIGNED
433 p_y += 16;
434 p_u += 8;
435 p_v += 8;
436 p_buffer += 16;
438 /* Here we do some unaligned reads and duplicate conversions, but
439 * at least we have all the pixels */
440 if( i_rewind )
442 p_y -= i_rewind;
443 p_u -= i_rewind >> 1;
444 p_v -= i_rewind >> 1;
445 p_buffer -= i_rewind;
447 SSE2_CALL (
448 SSE2_INIT_16_UNALIGNED
449 SSE2_YUV_MUL
450 SSE2_YUV_ADD
451 SSE2_UNPACK_16_UNALIGNED
453 p_y += 16;
454 p_u += 8;
455 p_v += 8;
457 SCALE_WIDTH;
458 SCALE_HEIGHT( 420, 2 );
460 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
461 if( i_y % 2 )
463 p_u += i_source_margin_c;
464 p_v += i_source_margin_c;
466 p_buffer = b_hscale ? p_buffer_start : p_pic;
469 else
471 /* use slower SSE2 unaligned fetch and store */
472 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
474 p_pic_start = p_pic;
476 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
478 SSE2_CALL(
479 SSE2_INIT_16_UNALIGNED
480 SSE2_YUV_MUL
481 SSE2_YUV_ADD
482 SSE2_UNPACK_16_UNALIGNED
484 p_y += 16;
485 p_u += 8;
486 p_v += 8;
487 p_buffer += 16;
489 /* Here we do some unaligned reads and duplicate conversions, but
490 * at least we have all the pixels */
491 if( i_rewind )
493 p_y -= i_rewind;
494 p_u -= i_rewind >> 1;
495 p_v -= i_rewind >> 1;
496 p_buffer -= i_rewind;
498 SSE2_CALL(
499 SSE2_INIT_16_UNALIGNED
500 SSE2_YUV_MUL
501 SSE2_YUV_ADD
502 SSE2_UNPACK_16_UNALIGNED
504 p_y += 16;
505 p_u += 8;
506 p_v += 8;
508 SCALE_WIDTH;
509 SCALE_HEIGHT( 420, 2 );
511 p_y += i_source_margin;
512 if( i_y % 2 )
514 p_u += i_source_margin_c;
515 p_v += i_source_margin_c;
517 p_buffer = b_hscale ? p_buffer_start : p_pic;
521 /* make sure all SSE2 stores are visible thereafter */
522 SSE2_END;
524 #else /* SSE2 */
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
526 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
528 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
530 p_pic_start = p_pic;
531 p_buffer = b_hscale ? p_buffer_start : p_pic;
533 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
535 MMX_CALL (
536 MMX_INIT_16
537 MMX_YUV_MUL
538 MMX_YUV_ADD
539 MMX_UNPACK_16
541 p_y += 8;
542 p_u += 4;
543 p_v += 4;
544 p_buffer += 8;
547 /* Here we do some unaligned reads and duplicate conversions, but
548 * at least we have all the pixels */
549 if( i_rewind )
551 p_y -= i_rewind;
552 p_u -= i_rewind >> 1;
553 p_v -= i_rewind >> 1;
554 p_buffer -= i_rewind;
556 MMX_CALL (
557 MMX_INIT_16
558 MMX_YUV_MUL
559 MMX_YUV_ADD
560 MMX_UNPACK_16
562 p_y += 8;
563 p_u += 4;
564 p_v += 4;
566 SCALE_WIDTH;
567 SCALE_HEIGHT( 420, 2 );
569 p_y += i_source_margin;
570 if( i_y % 2 )
572 p_u += i_source_margin_c;
573 p_v += i_source_margin_c;
576 /* re-enable FPU registers */
577 MMX_END;
579 #endif /* SSE2 */
/* Convert an I420 (planar YUV 4:2:0) picture to 32-bit ARGB (A8R8G8B8),
 * with optional scaling to the output geometry. Same structure as the
 * 16-bit converters but with 32-bit pixels (SCALE_HEIGHT step of 4).
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
582 VLC_TARGET
583 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
584 picture_t *p_dest )
586 filter_sys_t *p_sys = p_filter->p_sys;
588 /* We got this one from the old arguments */
589 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
590 uint8_t *p_y = p_src->Y_PIXELS;
591 uint8_t *p_u = p_src->U_PIXELS;
592 uint8_t *p_v = p_src->V_PIXELS;
594 bool b_hscale; /* horizontal scaling type */
595 unsigned int i_vscale; /* vertical scaling type */
596 unsigned int i_x, i_y; /* horizontal and vertical indexes */
598 int i_right_margin;
599 int i_rewind;
600 int i_scale_count; /* scale modulo counter */
601 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
602 uint32_t * p_pic_start; /* beginning of the current line for copy */
603 /* Conversion buffer pointer */
604 uint32_t * p_buffer_start;
605 uint32_t * p_buffer;
607 /* Offset array pointer */
608 int * p_offset_start = p_sys->p_offset;
609 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
611 const int i_source_margin = p_src->p[0].i_pitch
612 - p_src->p[0].i_visible_pitch
613 - p_filter->fmt_in.video.i_x_offset;
614 const int i_source_margin_c = p_src->p[1].i_pitch
615 - p_src->p[1].i_visible_pitch
616 - ( p_filter->fmt_in.video.i_x_offset / 2 );
618 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
620 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
621 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
622 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
623 SetOffset( p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width,
624 p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height,
625 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
626 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
627 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
629 if(b_hscale &&
630 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
631 p_filter->fmt_in.video.i_x_offset +
632 p_filter->fmt_in.video.i_visible_width,
633 p_sys->i_bytespp))
634 return;
635 else p_buffer_start = (uint32_t*)p_sys->p_buffer;
638 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
640 i_scale_count = ( i_vscale == 1 ) ?
641 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
642 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
644 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
646 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
649 ** SSE2 128 bits fetch/store instructions are faster
650 ** if memory access is 16 bytes aligned
653 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
655 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
656 p_dest->p->i_pitch|
657 ((intptr_t)p_y)|
658 ((intptr_t)p_buffer))) )
660 /* use faster SSE2 aligned fetch and store */
661 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
663 p_pic_start = p_pic;
665 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
667 SSE2_CALL (
668 SSE2_INIT_32_ALIGNED
669 SSE2_YUV_MUL
670 SSE2_YUV_ADD
671 SSE2_UNPACK_32_ARGB_ALIGNED
673 p_y += 16;
674 p_u += 8;
675 p_v += 8;
676 p_buffer += 16;
679 /* Here we do some unaligned reads and duplicate conversions, but
680 * at least we have all the pixels */
681 if( i_rewind )
683 p_y -= i_rewind;
684 p_u -= i_rewind >> 1;
685 p_v -= i_rewind >> 1;
686 p_buffer -= i_rewind;
687 SSE2_CALL (
688 SSE2_INIT_32_UNALIGNED
689 SSE2_YUV_MUL
690 SSE2_YUV_ADD
691 SSE2_UNPACK_32_ARGB_UNALIGNED
693 p_y += 16;
694 p_u += 8;
695 p_v += 8;
697 SCALE_WIDTH;
698 SCALE_HEIGHT( 420, 4 );
700 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
701 if( i_y % 2 )
703 p_u += i_source_margin_c;
704 p_v += i_source_margin_c;
706 p_buffer = b_hscale ? p_buffer_start : p_pic;
709 else
711 /* use slower SSE2 unaligned fetch and store */
712 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
714 p_pic_start = p_pic;
716 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
718 SSE2_CALL (
719 SSE2_INIT_32_UNALIGNED
720 SSE2_YUV_MUL
721 SSE2_YUV_ADD
722 SSE2_UNPACK_32_ARGB_UNALIGNED
724 p_y += 16;
725 p_u += 8;
726 p_v += 8;
727 p_buffer += 16;
730 /* Here we do some unaligned reads and duplicate conversions, but
731 * at least we have all the pixels */
732 if( i_rewind )
734 p_y -= i_rewind;
735 p_u -= i_rewind >> 1;
736 p_v -= i_rewind >> 1;
737 p_buffer -= i_rewind;
738 SSE2_CALL (
739 SSE2_INIT_32_UNALIGNED
740 SSE2_YUV_MUL
741 SSE2_YUV_ADD
742 SSE2_UNPACK_32_ARGB_UNALIGNED
744 p_y += 16;
745 p_u += 8;
746 p_v += 8;
748 SCALE_WIDTH;
749 SCALE_HEIGHT( 420, 4 );
751 p_y += i_source_margin;
752 if( i_y % 2 )
754 p_u += i_source_margin_c;
755 p_v += i_source_margin_c;
757 p_buffer = b_hscale ? p_buffer_start : p_pic;
761 /* make sure all SSE2 stores are visible thereafter */
762 SSE2_END;
764 #else
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
766 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
768 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
770 p_pic_start = p_pic;
771 p_buffer = b_hscale ? p_buffer_start : p_pic;
773 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
775 MMX_CALL (
776 MMX_INIT_32
777 MMX_YUV_MUL
778 MMX_YUV_ADD
779 MMX_UNPACK_32_ARGB
781 p_y += 8;
782 p_u += 4;
783 p_v += 4;
784 p_buffer += 8;
787 /* Here we do some unaligned reads and duplicate conversions, but
788 * at least we have all the pixels */
789 if( i_rewind )
791 p_y -= i_rewind;
792 p_u -= i_rewind >> 1;
793 p_v -= i_rewind >> 1;
794 p_buffer -= i_rewind;
795 MMX_CALL (
796 MMX_INIT_32
797 MMX_YUV_MUL
798 MMX_YUV_ADD
799 MMX_UNPACK_32_ARGB
801 p_y += 8;
802 p_u += 4;
803 p_v += 4;
805 SCALE_WIDTH;
806 SCALE_HEIGHT( 420, 4 );
808 p_y += i_source_margin;
809 if( i_y % 2 )
811 p_u += i_source_margin_c;
812 p_v += i_source_margin_c;
816 /* re-enable FPU registers */
817 MMX_END;
819 #endif
/* Convert an I420 (planar YUV 4:2:0) picture to 32-bit RGBA (R8G8B8A8),
 * with optional scaling to the output geometry. Identical structure to
 * I420_A8R8G8B8, differing only in the RGBA packing macros.
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
822 VLC_TARGET
823 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
825 filter_sys_t *p_sys = p_filter->p_sys;
827 /* We got this one from the old arguments */
828 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
829 uint8_t *p_y = p_src->Y_PIXELS;
830 uint8_t *p_u = p_src->U_PIXELS;
831 uint8_t *p_v = p_src->V_PIXELS;
833 bool b_hscale; /* horizontal scaling type */
834 unsigned int i_vscale; /* vertical scaling type */
835 unsigned int i_x, i_y; /* horizontal and vertical indexes */
837 int i_right_margin;
838 int i_rewind;
839 int i_scale_count; /* scale modulo counter */
840 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
841 uint32_t * p_pic_start; /* beginning of the current line for copy */
842 /* Conversion buffer pointer */
843 uint32_t * p_buffer_start;
844 uint32_t * p_buffer;
846 /* Offset array pointer */
847 int * p_offset_start = p_sys->p_offset;
848 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
850 const int i_source_margin = p_src->p[0].i_pitch
851 - p_src->p[0].i_visible_pitch
852 - p_filter->fmt_in.video.i_x_offset;
853 const int i_source_margin_c = p_src->p[1].i_pitch
854 - p_src->p[1].i_visible_pitch
855 - ( p_filter->fmt_in.video.i_x_offset / 2 );
857 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
859 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
860 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
861 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
862 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
863 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
864 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
865 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
866 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
868 if(b_hscale &&
869 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
870 p_filter->fmt_in.video.i_x_offset +
871 p_filter->fmt_in.video.i_visible_width,
872 p_sys->i_bytespp))
873 return;
874 else p_buffer_start = (uint32_t*)p_sys->p_buffer;
877 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
879 i_scale_count = ( i_vscale == 1 ) ?
880 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
881 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
883 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
885 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
888 ** SSE2 128 bits fetch/store instructions are faster
889 ** if memory access is 16 bytes aligned
892 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
894 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
895 p_dest->p->i_pitch|
896 ((intptr_t)p_y)|
897 ((intptr_t)p_buffer))) )
899 /* use faster SSE2 aligned fetch and store */
900 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
902 p_pic_start = p_pic;
904 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
906 SSE2_CALL (
907 SSE2_INIT_32_ALIGNED
908 SSE2_YUV_MUL
909 SSE2_YUV_ADD
910 SSE2_UNPACK_32_RGBA_ALIGNED
912 p_y += 16;
913 p_u += 8;
914 p_v += 8;
915 p_buffer += 16;
918 /* Here we do some unaligned reads and duplicate conversions, but
919 * at least we have all the pixels */
920 if( i_rewind )
922 p_y -= i_rewind;
923 p_u -= i_rewind >> 1;
924 p_v -= i_rewind >> 1;
925 p_buffer -= i_rewind;
926 SSE2_CALL (
927 SSE2_INIT_32_UNALIGNED
928 SSE2_YUV_MUL
929 SSE2_YUV_ADD
930 SSE2_UNPACK_32_RGBA_UNALIGNED
932 p_y += 16;
933 p_u += 8;
934 p_v += 8;
936 SCALE_WIDTH;
937 SCALE_HEIGHT( 420, 4 );
939 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
940 if( i_y % 2 )
942 p_u += i_source_margin_c;
943 p_v += i_source_margin_c;
945 p_buffer = b_hscale ? p_buffer_start : p_pic;
948 else
950 /* use slower SSE2 unaligned fetch and store */
951 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
953 p_pic_start = p_pic;
955 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
957 SSE2_CALL (
958 SSE2_INIT_32_UNALIGNED
959 SSE2_YUV_MUL
960 SSE2_YUV_ADD
961 SSE2_UNPACK_32_RGBA_UNALIGNED
963 p_y += 16;
964 p_u += 8;
965 p_v += 8;
966 p_buffer += 16;
969 /* Here we do some unaligned reads and duplicate conversions, but
970 * at least we have all the pixels */
971 if( i_rewind )
973 p_y -= i_rewind;
974 p_u -= i_rewind >> 1;
975 p_v -= i_rewind >> 1;
976 p_buffer -= i_rewind;
977 SSE2_CALL (
978 SSE2_INIT_32_UNALIGNED
979 SSE2_YUV_MUL
980 SSE2_YUV_ADD
981 SSE2_UNPACK_32_RGBA_UNALIGNED
983 p_y += 16;
984 p_u += 8;
985 p_v += 8;
987 SCALE_WIDTH;
988 SCALE_HEIGHT( 420, 4 );
990 p_y += i_source_margin;
991 if( i_y % 2 )
993 p_u += i_source_margin_c;
994 p_v += i_source_margin_c;
996 p_buffer = b_hscale ? p_buffer_start : p_pic;
1000 /* make sure all SSE2 stores are visible thereafter */
1001 SSE2_END;
1003 #else
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
1005 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1007 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1009 p_pic_start = p_pic;
1010 p_buffer = b_hscale ? p_buffer_start : p_pic;
1012 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1014 MMX_CALL (
1015 MMX_INIT_32
1016 MMX_YUV_MUL
1017 MMX_YUV_ADD
1018 MMX_UNPACK_32_RGBA
1020 p_y += 8;
1021 p_u += 4;
1022 p_v += 4;
1023 p_buffer += 8;
1026 /* Here we do some unaligned reads and duplicate conversions, but
1027 * at least we have all the pixels */
1028 if( i_rewind )
1030 p_y -= i_rewind;
1031 p_u -= i_rewind >> 1;
1032 p_v -= i_rewind >> 1;
1033 p_buffer -= i_rewind;
1034 MMX_CALL (
1035 MMX_INIT_32
1036 MMX_YUV_MUL
1037 MMX_YUV_ADD
1038 MMX_UNPACK_32_RGBA
1040 p_y += 8;
1041 p_u += 4;
1042 p_v += 4;
1044 SCALE_WIDTH;
1045 SCALE_HEIGHT( 420, 4 );
1047 p_y += i_source_margin;
1048 if( i_y % 2 )
1050 p_u += i_source_margin_c;
1051 p_v += i_source_margin_c;
1055 /* re-enable FPU registers */
1056 MMX_END;
1058 #endif
/* Convert an I420 (planar YUV 4:2:0) picture to 32-bit BGRA (B8G8R8A8),
 * with optional scaling to the output geometry. Identical structure to
 * I420_A8R8G8B8, differing only in the BGRA packing macros.
 * NOTE(review): standalone brace/blank lines are missing from this listing
 * (extraction artifact); all code tokens are kept byte-identical. */
1061 VLC_TARGET
1062 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1064 filter_sys_t *p_sys = p_filter->p_sys;
1066 /* We got this one from the old arguments */
1067 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1068 uint8_t *p_y = p_src->Y_PIXELS;
1069 uint8_t *p_u = p_src->U_PIXELS;
1070 uint8_t *p_v = p_src->V_PIXELS;
1072 bool b_hscale; /* horizontal scaling type */
1073 unsigned int i_vscale; /* vertical scaling type */
1074 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1076 int i_right_margin;
1077 int i_rewind;
1078 int i_scale_count; /* scale modulo counter */
1079 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
1080 uint32_t * p_pic_start; /* beginning of the current line for copy */
1081 /* Conversion buffer pointer */
1082 uint32_t * p_buffer_start;
1083 uint32_t * p_buffer;
1085 /* Offset array pointer */
1086 int * p_offset_start = p_sys->p_offset;
1087 int * p_offset;
/* Bytes between the end of a plane's visible area and the next line. */
1089 const int i_source_margin = p_src->p[0].i_pitch
1090 - p_src->p[0].i_visible_pitch
1091 - p_filter->fmt_in.video.i_x_offset;
1092 const int i_source_margin_c = p_src->p[1].i_pitch
1093 - p_src->p[1].i_visible_pitch
1094 - ( p_filter->fmt_in.video.i_x_offset / 2 );
1096 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1098 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1099 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1100 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1101 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
1102 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
1103 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
1104 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
1105 &b_hscale, &i_vscale, p_offset_start );
/* Horizontal scaling converts into an intermediate line buffer first;
 * give up silently if that buffer cannot be (re)allocated. */
1107 if(b_hscale &&
1108 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
1109 p_filter->fmt_in.video.i_x_offset +
1110 p_filter->fmt_in.video.i_visible_width,
1111 p_sys->i_bytespp))
1112 return;
1113 else p_buffer_start = (uint32_t*)p_sys->p_buffer;
1116 * Perform conversion
/* Seed the vertical scale counter: output height when expanding,
 * input height otherwise. */
1118 i_scale_count = ( i_vscale == 1 ) ?
1119 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
1120 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
1122 #ifdef SSE2
/* Pixels short of a full 16-pixel group; the tail is redone by rewinding
 * the pointers and converting one extra (overlapping) unaligned group. */
1124 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
1127 ** SSE2 128 bits fetch/store instructions are faster
1128 ** if memory access is 16 bytes aligned
1131 p_buffer = b_hscale ? p_buffer_start : p_pic;
/* Aligned fast path only when both pitches and both start pointers are
 * 16-byte aligned. */
1133 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1134 p_dest->p->i_pitch|
1135 ((intptr_t)p_y)|
1136 ((intptr_t)p_buffer))) )
1138 /* use faster SSE2 aligned fetch and store */
1139 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1141 p_pic_start = p_pic;
1143 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1145 SSE2_CALL (
1146 SSE2_INIT_32_ALIGNED
1147 SSE2_YUV_MUL
1148 SSE2_YUV_ADD
1149 SSE2_UNPACK_32_BGRA_ALIGNED
1151 p_y += 16;
1152 p_u += 8;
1153 p_v += 8;
1154 p_buffer += 16;
1157 /* Here we do some unaligned reads and duplicate conversions, but
1158 * at least we have all the pixels */
1159 if( i_rewind )
1161 p_y -= i_rewind;
1162 p_u -= i_rewind >> 1;
1163 p_v -= i_rewind >> 1;
1164 p_buffer -= i_rewind;
1165 SSE2_CALL (
1166 SSE2_INIT_32_UNALIGNED
1167 SSE2_YUV_MUL
1168 SSE2_YUV_ADD
1169 SSE2_UNPACK_32_BGRA_UNALIGNED
1171 p_y += 16;
1172 p_u += 8;
1173 p_v += 8;
1175 SCALE_WIDTH;
1176 SCALE_HEIGHT( 420, 4 );
1178 p_y += i_source_margin;
/* 4:2:0 chroma: U/V planes advance only every other luma line. */
1179 if( i_y % 2 )
1181 p_u += i_source_margin_c;
1182 p_v += i_source_margin_c;
1184 p_buffer = b_hscale ? p_buffer_start : p_pic;
1187 else
1189 /* use slower SSE2 unaligned fetch and store */
1190 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1192 p_pic_start = p_pic;
1194 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1196 SSE2_CALL (
1197 SSE2_INIT_32_UNALIGNED
1198 SSE2_YUV_MUL
1199 SSE2_YUV_ADD
1200 SSE2_UNPACK_32_BGRA_UNALIGNED
1202 p_y += 16;
1203 p_u += 8;
1204 p_v += 8;
1205 p_buffer += 16;
1208 /* Here we do some unaligned reads and duplicate conversions, but
1209 * at least we have all the pixels */
1210 if( i_rewind )
1212 p_y -= i_rewind;
1213 p_u -= i_rewind >> 1;
1214 p_v -= i_rewind >> 1;
1215 p_buffer -= i_rewind;
1216 SSE2_CALL (
1217 SSE2_INIT_32_UNALIGNED
1218 SSE2_YUV_MUL
1219 SSE2_YUV_ADD
1220 SSE2_UNPACK_32_BGRA_UNALIGNED
1222 p_y += 16;
1223 p_u += 8;
1224 p_v += 8;
1226 SCALE_WIDTH;
1227 SCALE_HEIGHT( 420, 4 );
1229 p_y += i_source_margin;
1230 if( i_y % 2 )
1232 p_u += i_source_margin_c;
1233 p_v += i_source_margin_c;
1235 p_buffer = b_hscale ? p_buffer_start : p_pic;
1239 /* make sure all SSE2 stores are visible thereafter */
1240 SSE2_END;
1242 #else
/* MMX fallback: 8 pixels per iteration, same rewind trick with an
 * 8-pixel group size. */
1244 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1246 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1248 p_pic_start = p_pic;
1249 p_buffer = b_hscale ? p_buffer_start : p_pic;
1251 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1253 MMX_CALL (
1254 MMX_INIT_32
1255 MMX_YUV_MUL
1256 MMX_YUV_ADD
1257 MMX_UNPACK_32_BGRA
1259 p_y += 8;
1260 p_u += 4;
1261 p_v += 4;
1262 p_buffer += 8;
1265 /* Here we do some unaligned reads and duplicate conversions, but
1266 * at least we have all the pixels */
1267 if( i_rewind )
1269 p_y -= i_rewind;
1270 p_u -= i_rewind >> 1;
1271 p_v -= i_rewind >> 1;
1272 p_buffer -= i_rewind;
1273 MMX_CALL (
1274 MMX_INIT_32
1275 MMX_YUV_MUL
1276 MMX_YUV_ADD
1277 MMX_UNPACK_32_BGRA
1279 p_y += 8;
1280 p_u += 4;
1281 p_v += 4;
1283 SCALE_WIDTH;
1284 SCALE_HEIGHT( 420, 4 );
1286 p_y += i_source_margin;
1287 if( i_y % 2 )
1289 p_u += i_source_margin_c;
1290 p_v += i_source_margin_c;
1294 /* re-enable FPU registers */
1295 MMX_END;
1297 #endif
1300 VLC_TARGET
1301 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1303 filter_sys_t *p_sys = p_filter->p_sys;
1305 /* We got this one from the old arguments */
1306 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1307 uint8_t *p_y = p_src->Y_PIXELS;
1308 uint8_t *p_u = p_src->U_PIXELS;
1309 uint8_t *p_v = p_src->V_PIXELS;
1311 bool b_hscale; /* horizontal scaling type */
1312 unsigned int i_vscale; /* vertical scaling type */
1313 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1315 int i_right_margin;
1316 int i_rewind;
1317 int i_scale_count; /* scale modulo counter */
1318 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
1319 uint32_t * p_pic_start; /* beginning of the current line for copy */
1320 /* Conversion buffer pointer */
1321 uint32_t * p_buffer_start;
1322 uint32_t * p_buffer;
1324 /* Offset array pointer */
1325 int * p_offset_start = p_sys->p_offset;
1326 int * p_offset;
1328 const int i_source_margin = p_src->p[0].i_pitch
1329 - p_src->p[0].i_visible_pitch
1330 - p_filter->fmt_in.video.i_x_offset;
1331 const int i_source_margin_c = p_src->p[1].i_pitch
1332 - p_src->p[1].i_visible_pitch
1333 - ( p_filter->fmt_in.video.i_x_offset / 2 );
1335 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1337 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1338 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1339 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1340 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
1341 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
1342 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
1343 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
1344 &b_hscale, &i_vscale, p_offset_start );
1346 if(b_hscale &&
1347 AllocateOrGrow(&p_sys->p_buffer, &p_sys->i_buffer_size,
1348 p_filter->fmt_in.video.i_x_offset +
1349 p_filter->fmt_in.video.i_visible_width,
1350 p_sys->i_bytespp))
1351 return;
1352 else p_buffer_start = (uint32_t*)p_sys->p_buffer;
1355 * Perform conversion
1357 i_scale_count = ( i_vscale == 1 ) ?
1358 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
1359 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
1361 #ifdef SSE2
1363 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
1366 ** SSE2 128 bits fetch/store instructions are faster
1367 ** if memory access is 16 bytes aligned
1370 p_buffer = b_hscale ? p_buffer_start : p_pic;
1372 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1373 p_dest->p->i_pitch|
1374 ((intptr_t)p_y)|
1375 ((intptr_t)p_buffer))) )
1377 /* use faster SSE2 aligned fetch and store */
1378 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1380 p_pic_start = p_pic;
1382 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1384 SSE2_CALL (
1385 SSE2_INIT_32_ALIGNED
1386 SSE2_YUV_MUL
1387 SSE2_YUV_ADD
1388 SSE2_UNPACK_32_ABGR_ALIGNED
1390 p_y += 16;
1391 p_u += 8;
1392 p_v += 8;
1393 p_buffer += 16;
1396 /* Here we do some unaligned reads and duplicate conversions, but
1397 * at least we have all the pixels */
1398 if( i_rewind )
1400 p_y -= i_rewind;
1401 p_u -= i_rewind >> 1;
1402 p_v -= i_rewind >> 1;
1403 p_buffer -= i_rewind;
1404 SSE2_CALL (
1405 SSE2_INIT_32_UNALIGNED
1406 SSE2_YUV_MUL
1407 SSE2_YUV_ADD
1408 SSE2_UNPACK_32_ABGR_UNALIGNED
1410 p_y += 16;
1411 p_u += 8;
1412 p_v += 8;
1414 SCALE_WIDTH;
1415 SCALE_HEIGHT( 420, 4 );
1417 p_y += i_source_margin;
1418 if( i_y % 2 )
1420 p_u += i_source_margin_c;
1421 p_v += i_source_margin_c;
1423 p_buffer = b_hscale ? p_buffer_start : p_pic;
1426 else
1428 /* use slower SSE2 unaligned fetch and store */
1429 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1431 p_pic_start = p_pic;
1433 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1435 SSE2_CALL (
1436 SSE2_INIT_32_UNALIGNED
1437 SSE2_YUV_MUL
1438 SSE2_YUV_ADD
1439 SSE2_UNPACK_32_ABGR_UNALIGNED
1441 p_y += 16;
1442 p_u += 8;
1443 p_v += 8;
1444 p_buffer += 16;
1447 /* Here we do some unaligned reads and duplicate conversions, but
1448 * at least we have all the pixels */
1449 if( i_rewind )
1451 p_y -= i_rewind;
1452 p_u -= i_rewind >> 1;
1453 p_v -= i_rewind >> 1;
1454 p_buffer -= i_rewind;
1455 SSE2_CALL (
1456 SSE2_INIT_32_UNALIGNED
1457 SSE2_YUV_MUL
1458 SSE2_YUV_ADD
1459 SSE2_UNPACK_32_ABGR_UNALIGNED
1461 p_y += 16;
1462 p_u += 8;
1463 p_v += 8;
1465 SCALE_WIDTH;
1466 SCALE_HEIGHT( 420, 4 );
1468 p_y += i_source_margin;
1469 if( i_y % 2 )
1471 p_u += i_source_margin_c;
1472 p_v += i_source_margin_c;
1474 p_buffer = b_hscale ? p_buffer_start : p_pic;
1478 /* make sure all SSE2 stores are visible thereafter */
1479 SSE2_END;
1481 #else
1483 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1485 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1487 p_pic_start = p_pic;
1488 p_buffer = b_hscale ? p_buffer_start : p_pic;
1490 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1492 MMX_CALL (
1493 MMX_INIT_32
1494 MMX_YUV_MUL
1495 MMX_YUV_ADD
1496 MMX_UNPACK_32_ABGR
1498 p_y += 8;
1499 p_u += 4;
1500 p_v += 4;
1501 p_buffer += 8;
1504 /* Here we do some unaligned reads and duplicate conversions, but
1505 * at least we have all the pixels */
1506 if( i_rewind )
1508 p_y -= i_rewind;
1509 p_u -= i_rewind >> 1;
1510 p_v -= i_rewind >> 1;
1511 p_buffer -= i_rewind;
1512 MMX_CALL (
1513 MMX_INIT_32
1514 MMX_YUV_MUL
1515 MMX_YUV_ADD
1516 MMX_UNPACK_32_ABGR
1518 p_y += 8;
1519 p_u += 4;
1520 p_v += 4;
1522 SCALE_WIDTH;
1523 SCALE_HEIGHT( 420, 4 );
1525 p_y += i_source_margin;
1526 if( i_y % 2 )
1528 p_u += i_source_margin_c;
1529 p_v += i_source_margin_c;
1533 /* re-enable FPU registers */
1534 MMX_END;
1536 #endif