/*****************************************************************************
 * vdec_motion_inner_mmx.c : motion compensation inner routines optimized in
 *                           MMX
 *****************************************************************************
 * Copyright (C) 1999, 2000 VideoLAN
 * $Id: vdec_motion_inner_mmx.c,v 1.3 2001/06/07 22:14:55 sam Exp $
 *
 * Authors: Christophe Massiot <massiot@via.ecp.fr>, largely inspired by the
 *          work done by the livid project <http://www.linuxvideo.org/>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
 *****************************************************************************/

#define MODULE_NAME motionmmx
#include "modules_inner.h"

/*****************************************************************************
 * Preamble
 *****************************************************************************/
#include "defs.h"

#include "config.h"
#include "common.h"
#include "threads.h"
#include "mtime.h"

#include "video.h"

#include "attributes.h"
#include "mmx.h"
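
/* "mmx.h" provides the movq_* / punpck* / paddw_* / psraw_* / packuswb_*
 * wrapper macros used throughout this file; each one expands to a single
 * inline-asm MMX instruction operating on the mm0-mm7 registers. */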

/* OK, I know, this code has been taken from livid's mpeg2dec --Meuuh */

/* Some rounding constants */
mmx_t round1 = {0x0001000100010001LL};
mmx_t round4 = {0x0002000200020002LL};
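
/* Both constants replicate one value in all four 16-bit lanes: round1 adds 1
 * before the ">> 1" that implements (a + b + 1) / 2, and round4 adds 2 before
 * the ">> 2" that implements (a + b + c + d + 2) / 4, so the averages below
 * round to nearest as the scalar formulas in the comments require. */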

/*****************************************************************************
 * Useful functions
 *****************************************************************************/
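
/* All of the averaging helpers below expect mm0 to hold zero (set up by
 * MMXZeroReg()); it is the second operand of every punpcklbw/punpckhbw,
 * widening the 8-bit pixels to 16-bit words before the additions. */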

static __inline__ void MMXZeroReg()
{
    /* load 0 into mm0 */
    pxor_r2r(mm0,mm0);
}

static __inline__ void MMXAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*src1 + *src2 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows to mm1
    paddw_m2r(round1,mm1);
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs to mm2
    paddw_m2r(round1,mm2);
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}
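
/* Each helper processes 8 pixels per call (one movq-wide load and store);
 * the 16-pixel-wide motion routines further down therefore call them twice,
 * at byte offsets 0 and +8. */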

static __inline__ void MMXInterpAverage2( u8 *dst, u8 *src1, u8 *src2 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + 1)/2 + 1)/2);
    //

    movq_m2r(*dst,mm1);         // load 8 dst bytes
    movq_r2r(mm1,mm2);          // copy 8 dst bytes

    movq_m2r(*src1,mm3);        // load 8 src1 bytes
    movq_r2r(mm3,mm4);          // copy 8 src1 bytes

    movq_m2r(*src2,mm5);        // load 8 src2 bytes
    movq_r2r(mm5,mm6);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src1 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src2 bytes

    paddw_r2r(mm5,mm3);         // add lows
    paddw_m2r(round1,mm3);
    psraw_i2r(1,mm3);           // /2

    paddw_r2r(mm6,mm4);         // add highs
    paddw_m2r(round1,mm4);
    psraw_i2r(1,mm4);           // /2

    paddw_r2r(mm3,mm1);         // add lows
    paddw_m2r(round1,mm1);
    psraw_i2r(1,mm1);           // /2

    paddw_r2r(mm4,mm2);         // add highs
    paddw_m2r(round1,mm2);
    psraw_i2r(1,mm2);           // /2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}

static __inline__ void MMXAverage4( u8 *dst, u8 *src1, u8 *src2, u8 *src3,
                                    u8 *src4 )
{
    //
    // *dst = (*src1 + *src2 + *src3 + *src4 + 2) / 4;
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    // now have subtotal in mm1 and mm2

    paddw_m2r(round4,mm1);
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);
    psraw_i2r(2,mm2);           // /4

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}

static __inline__ void MMXInterpAverage4( u8 *dst, u8 *src1, u8 *src2,
                                          u8 *src3, u8 *src4 )
{
    //
    // *dst = clip_to_u8((*dst + (*src1 + *src2 + *src3 + *src4 + 2)/4 + 1)/2);
    //

    movq_m2r(*src1,mm1);        // load 8 src1 bytes
    movq_r2r(mm1,mm2);          // copy 8 src1 bytes

    punpcklbw_r2r(mm0,mm1);     // unpack low src1 bytes
    punpckhbw_r2r(mm0,mm2);     // unpack high src1 bytes

    movq_m2r(*src2,mm3);        // load 8 src2 bytes
    movq_r2r(mm3,mm4);          // copy 8 src2 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src2 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src2 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    // now have partials in mm1 and mm2

    movq_m2r(*src3,mm3);        // load 8 src3 bytes
    movq_r2r(mm3,mm4);          // copy 8 src3 bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low src3 bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high src3 bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    movq_m2r(*src4,mm5);        // load 8 src4 bytes
    movq_r2r(mm5,mm6);          // copy 8 src4 bytes

    punpcklbw_r2r(mm0,mm5);     // unpack low src4 bytes
    punpckhbw_r2r(mm0,mm6);     // unpack high src4 bytes

    paddw_r2r(mm5,mm1);         // add lows
    paddw_r2r(mm6,mm2);         // add highs

    paddw_m2r(round4,mm1);
    psraw_i2r(2,mm1);           // /4
    paddw_m2r(round4,mm2);
    psraw_i2r(2,mm2);           // /4

    // now have subtotal/4 in mm1 and mm2

    movq_m2r(*dst,mm3);         // load 8 dst bytes
    movq_r2r(mm3,mm4);          // copy 8 dst bytes

    punpcklbw_r2r(mm0,mm3);     // unpack low dst bytes
    punpckhbw_r2r(mm0,mm4);     // unpack high dst bytes

    paddw_r2r(mm3,mm1);         // add lows
    paddw_r2r(mm4,mm2);         // add highs

    paddw_m2r(round1,mm1);
    psraw_i2r(1,mm1);           // /2
    paddw_m2r(round1,mm2);
    psraw_i2r(1,mm2);           // /2

    // now have end value in mm1 and mm2

    packuswb_r2r(mm2,mm1);      // pack (w/ saturation)
    movq_r2m(mm1,*dst);         // store result in dst
}

/*****************************************************************************
 * Actual Motion compensation
 *****************************************************************************/
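
/*
 * Naming convention of the routines generated below (deduced from the macro
 * bodies): a lower-case x or y means the motion vector has an integer
 * component in that direction, an upper-case X or Y means a half-pel
 * component, handled by averaging two horizontally (X) or vertically (Y)
 * adjacent reference pixels, or all four neighbours (X_Y).  "copy" simply
 * writes the prediction to p_dest, while "avg" additionally averages it with
 * what is already stored there (used when a second prediction is combined
 * with a first one, e.g. in B pictures).
 *
 * For reference, a scalar sketch of the heaviest case, X_Y_avg, per pixel,
 * following the formulas documented with the helpers above:
 *
 *     p_dest[x] = ( p_dest[x]
 *                   + ( p_src[x] + p_src[x+1]
 *                       + p_src[x+i_stride] + p_src[x+i_stride+1] + 2 ) / 4
 *                   + 1 ) / 2;
 */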

#define pavg_r2r(src,dest)      pavgusb_r2r (src, dest);
#define pavg_m2r(src,dest)      pavgusb_m2r (src, dest);
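
/* pavgusb is the 3DNow! packed average of unsigned bytes; these pavg
 * wrappers are defined here but are not referenced by the plain-MMX routines
 * below, which do their averaging with paddw/psraw instead (presumably the
 * wrappers are shared with a 3DNow!-optimised variant of this code). */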

#define __MotionComponent_x_y_copy(width,height) \
void _M(MotionComponent_x_y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                     yuv_data_t * p_dest, \
                                                     int i_stride) \
{ \
    int i_y; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        movq_m2r( *p_src, mm0 );      /* load 8 ref bytes */ \
        if( width == 16 ) \
            movq_m2r( *(p_src + 8), mm1 ); \
        p_src += i_stride; \
        movq_r2m( mm0, *p_dest );     /* store 8 bytes at curr */ \
        if( width == 16 ) \
            movq_r2m( mm1, *(p_dest + 8) ); \
        p_dest += i_stride; \
    } \
}

#define __MotionComponent_X_y_copy(width,height) \
void _M(MotionComponent_X_y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                     yuv_data_t * p_dest, \
                                                     int i_stride) \
{ \
    int i_y; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXAverage2( p_dest, p_src, p_src + 1 ); \
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_src + 8, p_src + 9 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
    } \
}

#define __MotionComponent_x_Y_copy(width,height) \
void _M(MotionComponent_x_Y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                     yuv_data_t * p_dest, \
                                                     int i_stride) \
{ \
    int i_y; \
    yuv_data_t * p_next_src = p_src + i_stride; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXAverage2( p_dest, p_src, p_next_src ); \
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_src + 8, p_next_src + 8 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
        p_next_src += i_stride; \
    } \
}

#define __MotionComponent_X_Y_copy(width,height) \
void _M(MotionComponent_X_Y_copy_##width##_##height)(yuv_data_t * p_src, \
                                                     yuv_data_t * p_dest, \
                                                     int i_stride) \
{ \
    int i_y; \
    yuv_data_t * p_next_src = p_src + i_stride; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXAverage4( p_dest, p_src, p_src + 1, p_next_src, p_next_src + 1 ); \
        if( width == 16 ) \
        { \
            MMXAverage4( p_dest + 8, p_src + 8, p_src + 9, \
                         p_next_src + 8, p_next_src + 9 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
        p_next_src += i_stride; \
    } \
}

#define __MotionComponent_x_y_avg(width,height) \
void _M(MotionComponent_x_y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                    yuv_data_t * p_dest, \
                                                    int i_stride) \
{ \
    int i_y; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXAverage2( p_dest, p_dest, p_src ); \
        if( width == 16 ) \
        { \
            MMXAverage2( p_dest + 8, p_dest + 8, p_src + 8 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
    } \
}

#define __MotionComponent_X_y_avg(width,height) \
void _M(MotionComponent_X_y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                    yuv_data_t * p_dest, \
                                                    int i_stride) \
{ \
    int i_y; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXInterpAverage2( p_dest, p_src, p_src + 1 ); \
        if( width == 16 ) \
        { \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_src + 9 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
    } \
}

#define __MotionComponent_x_Y_avg(width,height) \
void _M(MotionComponent_x_Y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                    yuv_data_t * p_dest, \
                                                    int i_stride) \
{ \
    int i_y; \
    yuv_data_t * p_next_src = p_src + i_stride; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXInterpAverage2( p_dest, p_src, p_next_src ); \
        if( width == 16 ) \
        { \
            MMXInterpAverage2( p_dest + 8, p_src + 8, p_next_src + 8 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
        p_next_src += i_stride; \
    } \
}

#define __MotionComponent_X_Y_avg(width,height) \
void _M(MotionComponent_X_Y_avg_##width##_##height)(yuv_data_t * p_src, \
                                                    yuv_data_t * p_dest, \
                                                    int i_stride) \
{ \
    int i_y; \
    yuv_data_t * p_next_src = p_src + i_stride; \
    MMXZeroReg(); \
    for( i_y = 0; i_y < height; i_y ++ ) \
    { \
        MMXInterpAverage4( p_dest, p_src, p_src + 1, p_next_src, \
                           p_next_src + 1 ); \
        if( width == 16 ) \
        { \
            MMXInterpAverage4( p_dest + 8, p_src + 8, p_src + 9, \
                               p_next_src + 8, p_next_src + 9 ); \
        } \
        p_dest += i_stride; \
        p_src += i_stride; \
        p_next_src += i_stride; \
    } \
}

#define __MotionComponents(width,height) \
__MotionComponent_x_y_copy(width,height) \
__MotionComponent_X_y_copy(width,height) \
__MotionComponent_x_Y_copy(width,height) \
__MotionComponent_X_Y_copy(width,height) \
__MotionComponent_x_y_avg(width,height) \
__MotionComponent_X_y_avg(width,height) \
__MotionComponent_x_Y_avg(width,height) \
__MotionComponent_X_Y_avg(width,height)
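
/* Each __MotionComponents(width,height) expands to the eight routines
 * defined above (the four half-pel cases, each in a "copy" and an "avg"
 * flavour) for one block size. */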

__MotionComponents (16,16)      /* 444, 422, 420 */
__MotionComponents (16,8)       /* 444, 422, 420 */
__MotionComponents (8,8)        /* 422, 420 */
__MotionComponents (8,4)        /* 420 */
#if 0
__MotionComponents (8,16)       /* 422 */
#endif