/* modules/video_chroma/i420_rgb16_x86.c
 * (removed unrelated web-viewer residue: a foreign commit subject line
 * "demux: mp4: avoid audio cuts on seek" and blob hash 896694c48a92d8fc3d6a9ec064c1ba889498284d) */
1 /*****************************************************************************
2 * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
3 *****************************************************************************
4 * Copyright (C) 2000 VLC authors and VideoLAN
5 * $Id$
7 * Authors: Samuel Hocevar <sam@zoy.org>
8 * Damien Fouilleul <damienf@videolan.org>
10 * This program is free software; you can redistribute it and/or modify it
11 * under the terms of the GNU Lesser General Public License as published by
12 * the Free Software Foundation; either version 2.1 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public License
21 * along with this program; if not, write to the Free Software Foundation,
22 * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
23 *****************************************************************************/
25 #ifdef HAVE_CONFIG_H
26 # include "config.h"
27 #endif
29 #include <vlc_common.h>
30 #include <vlc_filter.h>
31 #include <vlc_picture.h>
32 #include <vlc_cpu.h>
34 #include "i420_rgb.h"
35 #ifdef SSE2
36 # include "i420_rgb_sse2.h"
37 # define VLC_TARGET VLC_SSE
38 #else
39 # include "i420_rgb_mmx.h"
40 # define VLC_TARGET VLC_MMX
41 #endif
/*****************************************************************************
 * SetOffset: build offset array for conversion functions
 *****************************************************************************
 * Builds the horizontal offset table consumed by the SCALE_WIDTH macro and
 * sets the horizontal and vertical scaling indicators.
 *
 * i_width/i_height        source dimensions (including x/y offset)
 * i_pic_width/i_pic_height destination dimensions (including x/y offset)
 * pb_hscale               out: true when horizontal scaling is needed
 * pi_vscale               out: 0 = none, 1 = expand, -1 = reduce
 *                         (stored in an unsigned int, so -1 wraps; callers
 *                         only compare against 0 and 1)
 * p_offset                out: offset table, caller-allocated; for expansion
 *                         one 0/1 "advance" flag per destination pixel, for
 *                         reduction one skip count per destination pixel
 *****************************************************************************/
static void SetOffset( int i_width, int i_height, int i_pic_width,
                       int i_pic_height, bool *pb_hscale,
                       unsigned int *pi_vscale, int *p_offset )
{
    /*
     * Prepare horizontal offset array
     */
    if( i_pic_width - i_width == 0 )
    {
        /* No horizontal scaling: YUV conversion is done directly to picture */
        *pb_hscale = 0;
    }
    else if( i_pic_width - i_width > 0 )
    {
        /* Prepare scaling array for horizontal extension: Bresenham-style
         * run of 0 flags (repeat source pixel) terminated by a 1 (advance) */
        int i_scale_count = i_pic_width;

        *pb_hscale = 1;
        for( int i_x = i_width; i_x--; )
        {
            while( ( i_scale_count -= i_width ) > 0 )
            {
                *p_offset++ = 0;
            }
            *p_offset++ = 1;
            i_scale_count += i_pic_width;
        }
    }
    else /* if( i_pic_width - i_width < 0 ) */
    {
        /* Prepare scaling array for horizontal reduction: each entry is the
         * number of source pixels to step over per destination pixel */
        int i_scale_count = i_pic_width;

        *pb_hscale = 1;
        for( int i_x = i_pic_width; i_x--; )
        {
            *p_offset = 1;
            while( ( i_scale_count -= i_pic_width ) > 0 )
            {
                *p_offset += 1;
            }
            p_offset++;
            i_scale_count += i_width;
        }
    }

    /*
     * Set vertical scaling indicator
     */
    if( i_pic_height - i_height == 0 )
        *pi_vscale = 0;
    else if( i_pic_height - i_height > 0 )
        *pi_vscale = 1;
    else /* if( i_pic_height - i_height < 0 ) */
        *pi_vscale = -1;
}
103 VLC_TARGET
104 void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
106 /* We got this one from the old arguments */
107 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
108 uint8_t *p_y = p_src->Y_PIXELS;
109 uint8_t *p_u = p_src->U_PIXELS;
110 uint8_t *p_v = p_src->V_PIXELS;
112 bool b_hscale; /* horizontal scaling type */
113 unsigned int i_vscale; /* vertical scaling type */
114 unsigned int i_x, i_y; /* horizontal and vertical indexes */
116 int i_right_margin;
117 int i_rewind;
118 int i_scale_count; /* scale modulo counter */
119 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
120 uint16_t * p_pic_start; /* beginning of the current line for copy */
122 /* Conversion buffer pointer */
123 uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
124 uint16_t * p_buffer;
126 /* Offset array pointer */
127 int * p_offset_start = p_filter->p_sys->p_offset;
128 int * p_offset;
130 const int i_source_margin = p_src->p[0].i_pitch
131 - p_src->p[0].i_visible_pitch
132 - p_filter->fmt_in.video.i_x_offset;
133 const int i_source_margin_c = p_src->p[1].i_pitch
134 - p_src->p[1].i_visible_pitch
135 - ( p_filter->fmt_in.video.i_x_offset / 2 );
137 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
139 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
140 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
141 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
142 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
143 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
144 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
145 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
146 &b_hscale, &i_vscale, p_offset_start );
150 * Perform conversion
152 i_scale_count = ( i_vscale == 1 ) ?
153 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
154 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
156 #ifdef SSE2
158 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
161 ** SSE2 128 bits fetch/store instructions are faster
162 ** if memory access is 16 bytes aligned
165 p_buffer = b_hscale ? p_buffer_start : p_pic;
166 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
167 p_dest->p->i_pitch|
168 ((intptr_t)p_y)|
169 ((intptr_t)p_buffer))) )
171 /* use faster SSE2 aligned fetch and store */
172 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
174 p_pic_start = p_pic;
176 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
178 SSE2_CALL (
179 SSE2_INIT_16_ALIGNED
180 SSE2_YUV_MUL
181 SSE2_YUV_ADD
182 SSE2_UNPACK_15_ALIGNED
184 p_y += 16;
185 p_u += 8;
186 p_v += 8;
187 p_buffer += 16;
189 /* Here we do some unaligned reads and duplicate conversions, but
190 * at least we have all the pixels */
191 if( i_rewind )
193 p_y -= i_rewind;
194 p_u -= i_rewind >> 1;
195 p_v -= i_rewind >> 1;
196 p_buffer -= i_rewind;
198 SSE2_CALL (
199 SSE2_INIT_16_UNALIGNED
200 SSE2_YUV_MUL
201 SSE2_YUV_ADD
202 SSE2_UNPACK_15_UNALIGNED
204 p_y += 16;
205 p_u += 8;
206 p_v += 8;
208 SCALE_WIDTH;
209 SCALE_HEIGHT( 420, 2 );
211 p_y += i_source_margin;
212 if( i_y % 2 )
214 p_u += i_source_margin_c;
215 p_v += i_source_margin_c;
217 p_buffer = b_hscale ? p_buffer_start : p_pic;
220 else
222 /* use slower SSE2 unaligned fetch and store */
223 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
225 p_pic_start = p_pic;
226 p_buffer = b_hscale ? p_buffer_start : p_pic;
228 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
230 SSE2_CALL (
231 SSE2_INIT_16_UNALIGNED
232 SSE2_YUV_MUL
233 SSE2_YUV_ADD
234 SSE2_UNPACK_15_UNALIGNED
236 p_y += 16;
237 p_u += 8;
238 p_v += 8;
239 p_buffer += 16;
241 /* Here we do some unaligned reads and duplicate conversions, but
242 * at least we have all the pixels */
243 if( i_rewind )
245 p_y -= i_rewind;
246 p_u -= i_rewind >> 1;
247 p_v -= i_rewind >> 1;
248 p_buffer -= i_rewind;
250 SSE2_CALL (
251 SSE2_INIT_16_UNALIGNED
252 SSE2_YUV_MUL
253 SSE2_YUV_ADD
254 SSE2_UNPACK_15_UNALIGNED
256 p_y += 16;
257 p_u += 8;
258 p_v += 8;
260 SCALE_WIDTH;
261 SCALE_HEIGHT( 420, 2 );
263 p_y += i_source_margin;
264 if( i_y % 2 )
266 p_u += i_source_margin_c;
267 p_v += i_source_margin_c;
269 p_buffer = b_hscale ? p_buffer_start : p_pic;
273 /* make sure all SSE2 stores are visible thereafter */
274 SSE2_END;
276 #else /* SSE2 */
278 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
280 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
282 p_pic_start = p_pic;
283 p_buffer = b_hscale ? p_buffer_start : p_pic;
285 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
287 MMX_CALL (
288 MMX_INIT_16
289 MMX_YUV_MUL
290 MMX_YUV_ADD
291 MMX_UNPACK_15
293 p_y += 8;
294 p_u += 4;
295 p_v += 4;
296 p_buffer += 8;
299 /* Here we do some unaligned reads and duplicate conversions, but
300 * at least we have all the pixels */
301 if( i_rewind )
303 p_y -= i_rewind;
304 p_u -= i_rewind >> 1;
305 p_v -= i_rewind >> 1;
306 p_buffer -= i_rewind;
308 MMX_CALL (
309 MMX_INIT_16
310 MMX_YUV_MUL
311 MMX_YUV_ADD
312 MMX_UNPACK_15
314 p_y += 8;
315 p_u += 4;
316 p_v += 4;
317 p_buffer += 8;
319 SCALE_WIDTH;
320 SCALE_HEIGHT( 420, 2 );
322 p_y += i_source_margin;
323 if( i_y % 2 )
325 p_u += i_source_margin_c;
326 p_v += i_source_margin_c;
329 /* re-enable FPU registers */
330 MMX_END;
332 #endif /* SSE2 */
335 VLC_TARGET
336 void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
338 /* We got this one from the old arguments */
339 uint16_t *p_pic = (uint16_t*)p_dest->p->p_pixels;
340 uint8_t *p_y = p_src->Y_PIXELS;
341 uint8_t *p_u = p_src->U_PIXELS;
342 uint8_t *p_v = p_src->V_PIXELS;
344 bool b_hscale; /* horizontal scaling type */
345 unsigned int i_vscale; /* vertical scaling type */
346 unsigned int i_x, i_y; /* horizontal and vertical indexes */
348 int i_right_margin;
349 int i_rewind;
350 int i_scale_count; /* scale modulo counter */
351 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
352 uint16_t * p_pic_start; /* beginning of the current line for copy */
354 /* Conversion buffer pointer */
355 uint16_t * p_buffer_start = (uint16_t*)p_filter->p_sys->p_buffer;
356 uint16_t * p_buffer;
358 /* Offset array pointer */
359 int * p_offset_start = p_filter->p_sys->p_offset;
360 int * p_offset;
362 const int i_source_margin = p_src->p[0].i_pitch
363 - p_src->p[0].i_visible_pitch
364 - p_filter->fmt_in.video.i_x_offset;
365 const int i_source_margin_c = p_src->p[1].i_pitch
366 - p_src->p[1].i_visible_pitch
367 - ( p_filter->fmt_in.video.i_x_offset / 2 );
369 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
371 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
372 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
373 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
374 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
375 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
376 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
377 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
378 &b_hscale, &i_vscale, p_offset_start );
382 * Perform conversion
384 i_scale_count = ( i_vscale == 1 ) ?
385 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
386 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
388 #ifdef SSE2
390 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
393 ** SSE2 128 bits fetch/store instructions are faster
394 ** if memory access is 16 bytes aligned
397 p_buffer = b_hscale ? p_buffer_start : p_pic;
398 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
399 p_dest->p->i_pitch|
400 ((intptr_t)p_y)|
401 ((intptr_t)p_buffer))) )
403 /* use faster SSE2 aligned fetch and store */
404 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
406 p_pic_start = p_pic;
408 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
410 SSE2_CALL (
411 SSE2_INIT_16_ALIGNED
412 SSE2_YUV_MUL
413 SSE2_YUV_ADD
414 SSE2_UNPACK_16_ALIGNED
416 p_y += 16;
417 p_u += 8;
418 p_v += 8;
419 p_buffer += 16;
421 /* Here we do some unaligned reads and duplicate conversions, but
422 * at least we have all the pixels */
423 if( i_rewind )
425 p_y -= i_rewind;
426 p_u -= i_rewind >> 1;
427 p_v -= i_rewind >> 1;
428 p_buffer -= i_rewind;
430 SSE2_CALL (
431 SSE2_INIT_16_UNALIGNED
432 SSE2_YUV_MUL
433 SSE2_YUV_ADD
434 SSE2_UNPACK_16_UNALIGNED
436 p_y += 16;
437 p_u += 8;
438 p_v += 8;
440 SCALE_WIDTH;
441 SCALE_HEIGHT( 420, 2 );
443 p_y += i_source_margin;
444 if( i_y % 2 )
446 p_u += i_source_margin_c;
447 p_v += i_source_margin_c;
449 p_buffer = b_hscale ? p_buffer_start : p_pic;
452 else
454 /* use slower SSE2 unaligned fetch and store */
455 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
457 p_pic_start = p_pic;
458 p_buffer = b_hscale ? p_buffer_start : p_pic;
460 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/16; i_x--; )
462 SSE2_CALL(
463 SSE2_INIT_16_UNALIGNED
464 SSE2_YUV_MUL
465 SSE2_YUV_ADD
466 SSE2_UNPACK_16_UNALIGNED
468 p_y += 16;
469 p_u += 8;
470 p_v += 8;
471 p_buffer += 16;
473 /* Here we do some unaligned reads and duplicate conversions, but
474 * at least we have all the pixels */
475 if( i_rewind )
477 p_y -= i_rewind;
478 p_u -= i_rewind >> 1;
479 p_v -= i_rewind >> 1;
480 p_buffer -= i_rewind;
482 SSE2_CALL(
483 SSE2_INIT_16_UNALIGNED
484 SSE2_YUV_MUL
485 SSE2_YUV_ADD
486 SSE2_UNPACK_16_UNALIGNED
488 p_y += 16;
489 p_u += 8;
490 p_v += 8;
492 SCALE_WIDTH;
493 SCALE_HEIGHT( 420, 2 );
495 p_y += i_source_margin;
496 if( i_y % 2 )
498 p_u += i_source_margin_c;
499 p_v += i_source_margin_c;
501 p_buffer = b_hscale ? p_buffer_start : p_pic;
505 /* make sure all SSE2 stores are visible thereafter */
506 SSE2_END;
508 #else /* SSE2 */
510 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
512 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
514 p_pic_start = p_pic;
515 p_buffer = b_hscale ? p_buffer_start : p_pic;
517 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
519 MMX_CALL (
520 MMX_INIT_16
521 MMX_YUV_MUL
522 MMX_YUV_ADD
523 MMX_UNPACK_16
525 p_y += 8;
526 p_u += 4;
527 p_v += 4;
528 p_buffer += 8;
531 /* Here we do some unaligned reads and duplicate conversions, but
532 * at least we have all the pixels */
533 if( i_rewind )
535 p_y -= i_rewind;
536 p_u -= i_rewind >> 1;
537 p_v -= i_rewind >> 1;
538 p_buffer -= i_rewind;
540 MMX_CALL (
541 MMX_INIT_16
542 MMX_YUV_MUL
543 MMX_YUV_ADD
544 MMX_UNPACK_16
546 p_y += 8;
547 p_u += 4;
548 p_v += 4;
549 p_buffer += 8;
551 SCALE_WIDTH;
552 SCALE_HEIGHT( 420, 2 );
554 p_y += i_source_margin;
555 if( i_y % 2 )
557 p_u += i_source_margin_c;
558 p_v += i_source_margin_c;
561 /* re-enable FPU registers */
562 MMX_END;
564 #endif /* SSE2 */
567 VLC_TARGET
568 void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
569 picture_t *p_dest )
571 /* We got this one from the old arguments */
572 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
573 uint8_t *p_y = p_src->Y_PIXELS;
574 uint8_t *p_u = p_src->U_PIXELS;
575 uint8_t *p_v = p_src->V_PIXELS;
577 bool b_hscale; /* horizontal scaling type */
578 unsigned int i_vscale; /* vertical scaling type */
579 unsigned int i_x, i_y; /* horizontal and vertical indexes */
581 int i_right_margin;
582 int i_rewind;
583 int i_scale_count; /* scale modulo counter */
584 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
585 uint32_t * p_pic_start; /* beginning of the current line for copy */
586 /* Conversion buffer pointer */
587 uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
588 uint32_t * p_buffer;
590 /* Offset array pointer */
591 int * p_offset_start = p_filter->p_sys->p_offset;
592 int * p_offset;
594 const int i_source_margin = p_src->p[0].i_pitch
595 - p_src->p[0].i_visible_pitch
596 - p_filter->fmt_in.video.i_x_offset;
597 const int i_source_margin_c = p_src->p[1].i_pitch
598 - p_src->p[1].i_visible_pitch
599 - ( p_filter->fmt_in.video.i_x_offset / 2 );
601 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
603 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
604 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
605 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
606 SetOffset( p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width,
607 p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height,
608 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
609 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
610 &b_hscale, &i_vscale, p_offset_start );
613 * Perform conversion
615 i_scale_count = ( i_vscale == 1 ) ?
616 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
617 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
619 #ifdef SSE2
621 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
624 ** SSE2 128 bits fetch/store instructions are faster
625 ** if memory access is 16 bytes aligned
628 p_buffer = b_hscale ? p_buffer_start : p_pic;
629 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
630 p_dest->p->i_pitch|
631 ((intptr_t)p_y)|
632 ((intptr_t)p_buffer))) )
634 /* use faster SSE2 aligned fetch and store */
635 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
637 p_pic_start = p_pic;
639 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
641 SSE2_CALL (
642 SSE2_INIT_32_ALIGNED
643 SSE2_YUV_MUL
644 SSE2_YUV_ADD
645 SSE2_UNPACK_32_ARGB_ALIGNED
647 p_y += 16;
648 p_u += 8;
649 p_v += 8;
650 p_buffer += 16;
653 /* Here we do some unaligned reads and duplicate conversions, but
654 * at least we have all the pixels */
655 if( i_rewind )
657 p_y -= i_rewind;
658 p_u -= i_rewind >> 1;
659 p_v -= i_rewind >> 1;
660 p_buffer -= i_rewind;
661 SSE2_CALL (
662 SSE2_INIT_32_UNALIGNED
663 SSE2_YUV_MUL
664 SSE2_YUV_ADD
665 SSE2_UNPACK_32_ARGB_UNALIGNED
667 p_y += 16;
668 p_u += 4;
669 p_v += 4;
671 SCALE_WIDTH;
672 SCALE_HEIGHT( 420, 4 );
674 p_y += i_source_margin;
675 if( i_y % 2 )
677 p_u += i_source_margin_c;
678 p_v += i_source_margin_c;
680 p_buffer = b_hscale ? p_buffer_start : p_pic;
683 else
685 /* use slower SSE2 unaligned fetch and store */
686 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
688 p_pic_start = p_pic;
689 p_buffer = b_hscale ? p_buffer_start : p_pic;
691 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
693 SSE2_CALL (
694 SSE2_INIT_32_UNALIGNED
695 SSE2_YUV_MUL
696 SSE2_YUV_ADD
697 SSE2_UNPACK_32_ARGB_UNALIGNED
699 p_y += 16;
700 p_u += 8;
701 p_v += 8;
702 p_buffer += 16;
705 /* Here we do some unaligned reads and duplicate conversions, but
706 * at least we have all the pixels */
707 if( i_rewind )
709 p_y -= i_rewind;
710 p_u -= i_rewind >> 1;
711 p_v -= i_rewind >> 1;
712 p_buffer -= i_rewind;
713 SSE2_CALL (
714 SSE2_INIT_32_UNALIGNED
715 SSE2_YUV_MUL
716 SSE2_YUV_ADD
717 SSE2_UNPACK_32_ARGB_UNALIGNED
719 p_y += 16;
720 p_u += 8;
721 p_v += 8;
723 SCALE_WIDTH;
724 SCALE_HEIGHT( 420, 4 );
726 p_y += i_source_margin;
727 if( i_y % 2 )
729 p_u += i_source_margin_c;
730 p_v += i_source_margin_c;
732 p_buffer = b_hscale ? p_buffer_start : p_pic;
736 /* make sure all SSE2 stores are visible thereafter */
737 SSE2_END;
739 #else
741 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
743 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
745 p_pic_start = p_pic;
746 p_buffer = b_hscale ? p_buffer_start : p_pic;
748 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
750 MMX_CALL (
751 MMX_INIT_32
752 MMX_YUV_MUL
753 MMX_YUV_ADD
754 MMX_UNPACK_32_ARGB
756 p_y += 8;
757 p_u += 4;
758 p_v += 4;
759 p_buffer += 8;
762 /* Here we do some unaligned reads and duplicate conversions, but
763 * at least we have all the pixels */
764 if( i_rewind )
766 p_y -= i_rewind;
767 p_u -= i_rewind >> 1;
768 p_v -= i_rewind >> 1;
769 p_buffer -= i_rewind;
770 MMX_CALL (
771 MMX_INIT_32
772 MMX_YUV_MUL
773 MMX_YUV_ADD
774 MMX_UNPACK_32_ARGB
776 p_y += 8;
777 p_u += 4;
778 p_v += 4;
779 p_buffer += 8;
781 SCALE_WIDTH;
782 SCALE_HEIGHT( 420, 4 );
784 p_y += i_source_margin;
785 if( i_y % 2 )
787 p_u += i_source_margin_c;
788 p_v += i_source_margin_c;
792 /* re-enable FPU registers */
793 MMX_END;
795 #endif
798 VLC_TARGET
799 void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
801 /* We got this one from the old arguments */
802 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
803 uint8_t *p_y = p_src->Y_PIXELS;
804 uint8_t *p_u = p_src->U_PIXELS;
805 uint8_t *p_v = p_src->V_PIXELS;
807 bool b_hscale; /* horizontal scaling type */
808 unsigned int i_vscale; /* vertical scaling type */
809 unsigned int i_x, i_y; /* horizontal and vertical indexes */
811 int i_right_margin;
812 int i_rewind;
813 int i_scale_count; /* scale modulo counter */
814 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
815 uint32_t * p_pic_start; /* beginning of the current line for copy */
816 /* Conversion buffer pointer */
817 uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
818 uint32_t * p_buffer;
820 /* Offset array pointer */
821 int * p_offset_start = p_filter->p_sys->p_offset;
822 int * p_offset;
824 const int i_source_margin = p_src->p[0].i_pitch
825 - p_src->p[0].i_visible_pitch
826 - p_filter->fmt_in.video.i_x_offset;
827 const int i_source_margin_c = p_src->p[1].i_pitch
828 - p_src->p[1].i_visible_pitch
829 - ( p_filter->fmt_in.video.i_x_offset / 2 );
831 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
833 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
834 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
835 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
836 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
837 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
838 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
839 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
840 &b_hscale, &i_vscale, p_offset_start );
843 * Perform conversion
845 i_scale_count = ( i_vscale == 1 ) ?
846 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
847 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
849 #ifdef SSE2
851 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
854 ** SSE2 128 bits fetch/store instructions are faster
855 ** if memory access is 16 bytes aligned
858 p_buffer = b_hscale ? p_buffer_start : p_pic;
859 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
860 p_dest->p->i_pitch|
861 ((intptr_t)p_y)|
862 ((intptr_t)p_buffer))) )
864 /* use faster SSE2 aligned fetch and store */
865 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
867 p_pic_start = p_pic;
869 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
871 SSE2_CALL (
872 SSE2_INIT_32_ALIGNED
873 SSE2_YUV_MUL
874 SSE2_YUV_ADD
875 SSE2_UNPACK_32_RGBA_ALIGNED
877 p_y += 16;
878 p_u += 8;
879 p_v += 8;
880 p_buffer += 16;
883 /* Here we do some unaligned reads and duplicate conversions, but
884 * at least we have all the pixels */
885 if( i_rewind )
887 p_y -= i_rewind;
888 p_u -= i_rewind >> 1;
889 p_v -= i_rewind >> 1;
890 p_buffer -= i_rewind;
891 SSE2_CALL (
892 SSE2_INIT_32_UNALIGNED
893 SSE2_YUV_MUL
894 SSE2_YUV_ADD
895 SSE2_UNPACK_32_RGBA_UNALIGNED
897 p_y += 16;
898 p_u += 4;
899 p_v += 4;
901 SCALE_WIDTH;
902 SCALE_HEIGHT( 420, 4 );
904 p_y += i_source_margin;
905 if( i_y % 2 )
907 p_u += i_source_margin_c;
908 p_v += i_source_margin_c;
910 p_buffer = b_hscale ? p_buffer_start : p_pic;
913 else
915 /* use slower SSE2 unaligned fetch and store */
916 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
918 p_pic_start = p_pic;
919 p_buffer = b_hscale ? p_buffer_start : p_pic;
921 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
923 SSE2_CALL (
924 SSE2_INIT_32_UNALIGNED
925 SSE2_YUV_MUL
926 SSE2_YUV_ADD
927 SSE2_UNPACK_32_RGBA_UNALIGNED
929 p_y += 16;
930 p_u += 8;
931 p_v += 8;
932 p_buffer += 16;
935 /* Here we do some unaligned reads and duplicate conversions, but
936 * at least we have all the pixels */
937 if( i_rewind )
939 p_y -= i_rewind;
940 p_u -= i_rewind >> 1;
941 p_v -= i_rewind >> 1;
942 p_buffer -= i_rewind;
943 SSE2_CALL (
944 SSE2_INIT_32_UNALIGNED
945 SSE2_YUV_MUL
946 SSE2_YUV_ADD
947 SSE2_UNPACK_32_RGBA_UNALIGNED
949 p_y += 16;
950 p_u += 8;
951 p_v += 8;
953 SCALE_WIDTH;
954 SCALE_HEIGHT( 420, 4 );
956 p_y += i_source_margin;
957 if( i_y % 2 )
959 p_u += i_source_margin_c;
960 p_v += i_source_margin_c;
962 p_buffer = b_hscale ? p_buffer_start : p_pic;
966 /* make sure all SSE2 stores are visible thereafter */
967 SSE2_END;
969 #else
971 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
973 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
975 p_pic_start = p_pic;
976 p_buffer = b_hscale ? p_buffer_start : p_pic;
978 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
980 MMX_CALL (
981 MMX_INIT_32
982 MMX_YUV_MUL
983 MMX_YUV_ADD
984 MMX_UNPACK_32_RGBA
986 p_y += 8;
987 p_u += 4;
988 p_v += 4;
989 p_buffer += 8;
992 /* Here we do some unaligned reads and duplicate conversions, but
993 * at least we have all the pixels */
994 if( i_rewind )
996 p_y -= i_rewind;
997 p_u -= i_rewind >> 1;
998 p_v -= i_rewind >> 1;
999 p_buffer -= i_rewind;
1000 MMX_CALL (
1001 MMX_INIT_32
1002 MMX_YUV_MUL
1003 MMX_YUV_ADD
1004 MMX_UNPACK_32_RGBA
1006 p_y += 8;
1007 p_u += 4;
1008 p_v += 4;
1009 p_buffer += 8;
1011 SCALE_WIDTH;
1012 SCALE_HEIGHT( 420, 4 );
1014 p_y += i_source_margin;
1015 if( i_y % 2 )
1017 p_u += i_source_margin_c;
1018 p_v += i_source_margin_c;
1022 /* re-enable FPU registers */
1023 MMX_END;
1025 #endif
1028 VLC_TARGET
1029 void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1031 /* We got this one from the old arguments */
1032 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1033 uint8_t *p_y = p_src->Y_PIXELS;
1034 uint8_t *p_u = p_src->U_PIXELS;
1035 uint8_t *p_v = p_src->V_PIXELS;
1037 bool b_hscale; /* horizontal scaling type */
1038 unsigned int i_vscale; /* vertical scaling type */
1039 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1041 int i_right_margin;
1042 int i_rewind;
1043 int i_scale_count; /* scale modulo counter */
1044 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
1045 uint32_t * p_pic_start; /* beginning of the current line for copy */
1046 /* Conversion buffer pointer */
1047 uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1048 uint32_t * p_buffer;
1050 /* Offset array pointer */
1051 int * p_offset_start = p_filter->p_sys->p_offset;
1052 int * p_offset;
1054 const int i_source_margin = p_src->p[0].i_pitch
1055 - p_src->p[0].i_visible_pitch
1056 - p_filter->fmt_in.video.i_x_offset;
1057 const int i_source_margin_c = p_src->p[1].i_pitch
1058 - p_src->p[1].i_visible_pitch
1059 - ( p_filter->fmt_in.video.i_x_offset / 2 );
1061 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1063 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1064 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1065 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1066 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
1067 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
1068 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
1069 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
1070 &b_hscale, &i_vscale, p_offset_start );
1073 * Perform conversion
1075 i_scale_count = ( i_vscale == 1 ) ?
1076 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
1077 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
1079 #ifdef SSE2
1081 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
1084 ** SSE2 128 bits fetch/store instructions are faster
1085 ** if memory access is 16 bytes aligned
1088 p_buffer = b_hscale ? p_buffer_start : p_pic;
1089 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1090 p_dest->p->i_pitch|
1091 ((intptr_t)p_y)|
1092 ((intptr_t)p_buffer))) )
1094 /* use faster SSE2 aligned fetch and store */
1095 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1097 p_pic_start = p_pic;
1099 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1101 SSE2_CALL (
1102 SSE2_INIT_32_ALIGNED
1103 SSE2_YUV_MUL
1104 SSE2_YUV_ADD
1105 SSE2_UNPACK_32_BGRA_ALIGNED
1107 p_y += 16;
1108 p_u += 8;
1109 p_v += 8;
1110 p_buffer += 16;
1113 /* Here we do some unaligned reads and duplicate conversions, but
1114 * at least we have all the pixels */
1115 if( i_rewind )
1117 p_y -= i_rewind;
1118 p_u -= i_rewind >> 1;
1119 p_v -= i_rewind >> 1;
1120 p_buffer -= i_rewind;
1121 SSE2_CALL (
1122 SSE2_INIT_32_UNALIGNED
1123 SSE2_YUV_MUL
1124 SSE2_YUV_ADD
1125 SSE2_UNPACK_32_BGRA_UNALIGNED
1127 p_y += 16;
1128 p_u += 4;
1129 p_v += 4;
1131 SCALE_WIDTH;
1132 SCALE_HEIGHT( 420, 4 );
1134 p_y += i_source_margin;
1135 if( i_y % 2 )
1137 p_u += i_source_margin_c;
1138 p_v += i_source_margin_c;
1140 p_buffer = b_hscale ? p_buffer_start : p_pic;
1143 else
1145 /* use slower SSE2 unaligned fetch and store */
1146 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1148 p_pic_start = p_pic;
1149 p_buffer = b_hscale ? p_buffer_start : p_pic;
1151 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1153 SSE2_CALL (
1154 SSE2_INIT_32_UNALIGNED
1155 SSE2_YUV_MUL
1156 SSE2_YUV_ADD
1157 SSE2_UNPACK_32_BGRA_UNALIGNED
1159 p_y += 16;
1160 p_u += 8;
1161 p_v += 8;
1162 p_buffer += 16;
1165 /* Here we do some unaligned reads and duplicate conversions, but
1166 * at least we have all the pixels */
1167 if( i_rewind )
1169 p_y -= i_rewind;
1170 p_u -= i_rewind >> 1;
1171 p_v -= i_rewind >> 1;
1172 p_buffer -= i_rewind;
1173 SSE2_CALL (
1174 SSE2_INIT_32_UNALIGNED
1175 SSE2_YUV_MUL
1176 SSE2_YUV_ADD
1177 SSE2_UNPACK_32_BGRA_UNALIGNED
1179 p_y += 16;
1180 p_u += 8;
1181 p_v += 8;
1183 SCALE_WIDTH;
1184 SCALE_HEIGHT( 420, 4 );
1186 p_y += i_source_margin;
1187 if( i_y % 2 )
1189 p_u += i_source_margin_c;
1190 p_v += i_source_margin_c;
1192 p_buffer = b_hscale ? p_buffer_start : p_pic;
1196 #else
1198 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1200 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1202 p_pic_start = p_pic;
1203 p_buffer = b_hscale ? p_buffer_start : p_pic;
1205 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1207 MMX_CALL (
1208 MMX_INIT_32
1209 MMX_YUV_MUL
1210 MMX_YUV_ADD
1211 MMX_UNPACK_32_BGRA
1213 p_y += 8;
1214 p_u += 4;
1215 p_v += 4;
1216 p_buffer += 8;
1219 /* Here we do some unaligned reads and duplicate conversions, but
1220 * at least we have all the pixels */
1221 if( i_rewind )
1223 p_y -= i_rewind;
1224 p_u -= i_rewind >> 1;
1225 p_v -= i_rewind >> 1;
1226 p_buffer -= i_rewind;
1227 MMX_CALL (
1228 MMX_INIT_32
1229 MMX_YUV_MUL
1230 MMX_YUV_ADD
1231 MMX_UNPACK_32_BGRA
1233 p_y += 8;
1234 p_u += 4;
1235 p_v += 4;
1236 p_buffer += 8;
1238 SCALE_WIDTH;
1239 SCALE_HEIGHT( 420, 4 );
1241 p_y += i_source_margin;
1242 if( i_y % 2 )
1244 p_u += i_source_margin_c;
1245 p_v += i_source_margin_c;
1249 /* re-enable FPU registers */
1250 MMX_END;
1252 #endif
1255 VLC_TARGET
1256 void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
1258 /* We got this one from the old arguments */
1259 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels;
1260 uint8_t *p_y = p_src->Y_PIXELS;
1261 uint8_t *p_u = p_src->U_PIXELS;
1262 uint8_t *p_v = p_src->V_PIXELS;
1264 bool b_hscale; /* horizontal scaling type */
1265 unsigned int i_vscale; /* vertical scaling type */
1266 unsigned int i_x, i_y; /* horizontal and vertical indexes */
1268 int i_right_margin;
1269 int i_rewind;
1270 int i_scale_count; /* scale modulo counter */
1271 int i_chroma_width = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 2; /* chroma width */
1272 uint32_t * p_pic_start; /* beginning of the current line for copy */
1273 /* Conversion buffer pointer */
1274 uint32_t * p_buffer_start = (uint32_t*)p_filter->p_sys->p_buffer;
1275 uint32_t * p_buffer;
1277 /* Offset array pointer */
1278 int * p_offset_start = p_filter->p_sys->p_offset;
1279 int * p_offset;
1281 const int i_source_margin = p_src->p[0].i_pitch
1282 - p_src->p[0].i_visible_pitch
1283 - p_filter->fmt_in.video.i_x_offset;
1284 const int i_source_margin_c = p_src->p[1].i_pitch
1285 - p_src->p[1].i_visible_pitch
1286 - ( p_filter->fmt_in.video.i_x_offset / 2 );
1288 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch;
1290 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered
1291 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1'
1292 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */
1293 SetOffset( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width),
1294 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height),
1295 (p_filter->fmt_out.video.i_x_offset + p_filter->fmt_out.video.i_visible_width),
1296 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height),
1297 &b_hscale, &i_vscale, p_offset_start );
1300 * Perform conversion
1302 i_scale_count = ( i_vscale == 1 ) ?
1303 (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
1304 (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
1306 #ifdef SSE2
1308 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
1311 ** SSE2 128 bits fetch/store instructions are faster
1312 ** if memory access is 16 bytes aligned
1315 p_buffer = b_hscale ? p_buffer_start : p_pic;
1316 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
1317 p_dest->p->i_pitch|
1318 ((intptr_t)p_y)|
1319 ((intptr_t)p_buffer))) )
1321 /* use faster SSE2 aligned fetch and store */
1322 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1324 p_pic_start = p_pic;
1326 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1328 SSE2_CALL (
1329 SSE2_INIT_32_ALIGNED
1330 SSE2_YUV_MUL
1331 SSE2_YUV_ADD
1332 SSE2_UNPACK_32_ABGR_ALIGNED
1334 p_y += 16;
1335 p_u += 8;
1336 p_v += 8;
1337 p_buffer += 16;
1340 /* Here we do some unaligned reads and duplicate conversions, but
1341 * at least we have all the pixels */
1342 if( i_rewind )
1344 p_y -= i_rewind;
1345 p_u -= i_rewind >> 1;
1346 p_v -= i_rewind >> 1;
1347 p_buffer -= i_rewind;
1348 SSE2_CALL (
1349 SSE2_INIT_32_UNALIGNED
1350 SSE2_YUV_MUL
1351 SSE2_YUV_ADD
1352 SSE2_UNPACK_32_ABGR_UNALIGNED
1354 p_y += 16;
1355 p_u += 4;
1356 p_v += 4;
1358 SCALE_WIDTH;
1359 SCALE_HEIGHT( 420, 4 );
1361 p_y += i_source_margin;
1362 if( i_y % 2 )
1364 p_u += i_source_margin_c;
1365 p_v += i_source_margin_c;
1367 p_buffer = b_hscale ? p_buffer_start : p_pic;
1370 else
1372 /* use slower SSE2 unaligned fetch and store */
1373 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1375 p_pic_start = p_pic;
1376 p_buffer = b_hscale ? p_buffer_start : p_pic;
1378 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 16; i_x--; )
1380 SSE2_CALL (
1381 SSE2_INIT_32_UNALIGNED
1382 SSE2_YUV_MUL
1383 SSE2_YUV_ADD
1384 SSE2_UNPACK_32_ABGR_UNALIGNED
1386 p_y += 16;
1387 p_u += 8;
1388 p_v += 8;
1389 p_buffer += 16;
1392 /* Here we do some unaligned reads and duplicate conversions, but
1393 * at least we have all the pixels */
1394 if( i_rewind )
1396 p_y -= i_rewind;
1397 p_u -= i_rewind >> 1;
1398 p_v -= i_rewind >> 1;
1399 p_buffer -= i_rewind;
1400 SSE2_CALL (
1401 SSE2_INIT_32_UNALIGNED
1402 SSE2_YUV_MUL
1403 SSE2_YUV_ADD
1404 SSE2_UNPACK_32_ABGR_UNALIGNED
1406 p_y += 16;
1407 p_u += 8;
1408 p_v += 8;
1410 SCALE_WIDTH;
1411 SCALE_HEIGHT( 420, 4 );
1413 p_y += i_source_margin;
1414 if( i_y % 2 )
1416 p_u += i_source_margin_c;
1417 p_v += i_source_margin_c;
1419 p_buffer = b_hscale ? p_buffer_start : p_pic;
1423 #else
1425 i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
1427 for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
1429 p_pic_start = p_pic;
1430 p_buffer = b_hscale ? p_buffer_start : p_pic;
1432 for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 8; i_x--; )
1434 MMX_CALL (
1435 MMX_INIT_32
1436 MMX_YUV_MUL
1437 MMX_YUV_ADD
1438 MMX_UNPACK_32_ABGR
1440 p_y += 8;
1441 p_u += 4;
1442 p_v += 4;
1443 p_buffer += 8;
1446 /* Here we do some unaligned reads and duplicate conversions, but
1447 * at least we have all the pixels */
1448 if( i_rewind )
1450 p_y -= i_rewind;
1451 p_u -= i_rewind >> 1;
1452 p_v -= i_rewind >> 1;
1453 p_buffer -= i_rewind;
1454 MMX_CALL (
1455 MMX_INIT_32
1456 MMX_YUV_MUL
1457 MMX_YUV_ADD
1458 MMX_UNPACK_32_ABGR
1460 p_y += 8;
1461 p_u += 4;
1462 p_v += 4;
1463 p_buffer += 8;
1465 SCALE_WIDTH;
1466 SCALE_HEIGHT( 420, 4 );
1468 p_y += i_source_margin;
1469 if( i_y % 2 )
1471 p_u += i_source_margin_c;
1472 p_v += i_source_margin_c;
1476 /* re-enable FPU registers */
1477 MMX_END;
1479 #endif