extern/ffmpeg/libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard.
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  *
  20  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
  21  */
  22
  23 /**
  24  * @file dsputil.c
  25  * DSP utils
  26  */
  27
  28 #include "avcodec.h"
  29 #include "dsputil.h"
  30 #include "mpegvideo.h"
  31 #include "simple_idct.h"
  32 #include "faandct.h"
  33 #include "snow.h"
  34
  35 /* snow.c */
  36 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  37
  38 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  39 uint32_t squareTbl[512] = {0, };
  40
  41 const uint8_t ff_zigzag_direct[64] = {
  42     0,   1,  8, 16,  9,  2,  3, 10,
  43     17, 24, 32, 25, 18, 11,  4,  5,
  44     12, 19, 26, 33, 40, 48, 41, 34,
  45     27, 20, 13,  6,  7, 14, 21, 28,
  46     35, 42, 49, 56, 57, 50, 43, 36,
  47     29, 22, 15, 23, 30, 37, 44, 51,
  48     58, 59, 52, 45, 38, 31, 39, 46,
  49     53, 60, 61, 54, 47, 55, 62, 63
  50 };
  51
  52 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  53    specification, we interleave the fields */
  54 const uint8_t ff_zigzag248_direct[64] = {
  55      0,  8,  1,  9, 16, 24,  2, 10,
  56     17, 25, 32, 40, 48, 56, 33, 41,
  57     18, 26,  3, 11,  4, 12, 19, 27,
  58     34, 42, 49, 57, 50, 58, 35, 43,
  59     20, 28,  5, 13,  6, 14, 21, 29,
  60     36, 44, 51, 59, 52, 60, 37, 45,
  61     22, 30,  7, 15, 23, 31, 38, 46,
  62     53, 61, 54, 62, 39, 47, 55, 63,
  63 };
  64
  65 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  66 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  67
  68 const uint8_t ff_alternate_horizontal_scan[64] = {
  69     0,  1,   2,  3,  8,  9, 16, 17,
  70     10, 11,  4,  5,  6,  7, 15, 14,
  71     13, 12, 19, 18, 24, 25, 32, 33,
  72     26, 27, 20, 21, 22, 23, 28, 29,
  73     30, 31, 34, 35, 40, 41, 48, 49,
  74     42, 43, 36, 37, 38, 39, 44, 45,
  75     46, 47, 50, 51, 56, 57, 58, 59,
  76     52, 53, 54, 55, 60, 61, 62, 63,
  77 };
  78
  79 const uint8_t ff_alternate_vertical_scan[64] = {
  80     0,  8,  16, 24,  1,  9,  2, 10,
  81     17, 25, 32, 40, 48, 56, 57, 49,
  82     41, 33, 26, 18,  3, 11,  4, 12,
  83     19, 27, 34, 42, 50, 58, 35, 43,
  84     51, 59, 20, 28,  5, 13,  6, 14,
  85     21, 29, 36, 44, 52, 60, 37, 45,
  86     53, 61, 22, 30,  7, 15, 23, 31,
  87     38, 46, 54, 62, 39, 47, 55, 63,
  88 };
  89
  90 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
  91 const uint32_t inverse[256]={
  92          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
  93  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
  94  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
  95  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
  96  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
  97  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  98   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  99   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 100   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 101   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 102   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 103   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 104   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 105   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 106   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 107   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 108   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 109   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 110   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 111   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 112   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 113   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 114   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 115   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 116   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 117   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 118   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 119   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 120   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 121   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 122   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 123   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 124 };
 125
 126 /* Input permutation for the simple_idct_mmx */
 127 static const uint8_t simple_mmx_permutation[64]={
 128         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 129         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 130         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 131         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 132         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 133         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 134         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 135         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 136 };
 137
 138 static int pix_sum_c(uint8_t * pix, int line_size)
 139 {
 140     int s, i, j;
 141
 142     s = 0;
 143     for (i = 0; i < 16; i++) {
 144         for (j = 0; j < 16; j += 8) {
 145             s += pix[0];
 146             s += pix[1];
 147             s += pix[2];
 148             s += pix[3];
 149             s += pix[4];
 150             s += pix[5];
 151             s += pix[6];
 152             s += pix[7];
 153             pix += 8;
 154         }
 155         pix += line_size - 16;
 156     }
 157     return s;
 158 }
 159
 160 static int pix_norm1_c(uint8_t * pix, int line_size)
 161 {
 162     int s, i, j;
 163     uint32_t *sq = squareTbl + 256;
 164
 165     s = 0;
 166     for (i = 0; i < 16; i++) {
 167         for (j = 0; j < 16; j += 8) {
 168 #if 0
 169             s += sq[pix[0]];
 170             s += sq[pix[1]];
 171             s += sq[pix[2]];
 172             s += sq[pix[3]];
 173             s += sq[pix[4]];
 174             s += sq[pix[5]];
 175             s += sq[pix[6]];
 176             s += sq[pix[7]];
 177 #else
 178 #if LONG_MAX > 2147483647
 179             register uint64_t x=*(uint64_t*)pix;
 180             s += sq[x&0xff];
 181             s += sq[(x>>8)&0xff];
 182             s += sq[(x>>16)&0xff];
 183             s += sq[(x>>24)&0xff];
 184             s += sq[(x>>32)&0xff];
 185             s += sq[(x>>40)&0xff];
 186             s += sq[(x>>48)&0xff];
 187             s += sq[(x>>56)&0xff];
 188 #else
 189             register uint32_t x=*(uint32_t*)pix;
 190             s += sq[x&0xff];
 191             s += sq[(x>>8)&0xff];
 192             s += sq[(x>>16)&0xff];
 193             s += sq[(x>>24)&0xff];
 194             x=*(uint32_t*)(pix+4);
 195             s += sq[x&0xff];
 196             s += sq[(x>>8)&0xff];
 197             s += sq[(x>>16)&0xff];
 198             s += sq[(x>>24)&0xff];
 199 #endif
 200 #endif
 201             pix += 8;
 202         }
 203         pix += line_size - 16;
 204     }
 205     return s;
 206 }
 207
 208 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
 209     int i;
 210
 211     for(i=0; i+8<=w; i+=8){
 212         dst[i+0]= bswap_32(src[i+0]);
 213         dst[i+1]= bswap_32(src[i+1]);
 214         dst[i+2]= bswap_32(src[i+2]);
 215         dst[i+3]= bswap_32(src[i+3]);
 216         dst[i+4]= bswap_32(src[i+4]);
 217         dst[i+5]= bswap_32(src[i+5]);
 218         dst[i+6]= bswap_32(src[i+6]);
 219         dst[i+7]= bswap_32(src[i+7]);
 220     }
 221     for(;i<w; i++){
 222         dst[i+0]= bswap_32(src[i+0]);
 223     }
 224 }
 225
 226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 227 {
 228     int s, i;
 229     uint32_t *sq = squareTbl + 256;
 230
 231     s = 0;
 232     for (i = 0; i < h; i++) {
 233         s += sq[pix1[0] - pix2[0]];
 234         s += sq[pix1[1] - pix2[1]];
 235         s += sq[pix1[2] - pix2[2]];
 236         s += sq[pix1[3] - pix2[3]];
 237         pix1 += line_size;
 238         pix2 += line_size;
 239     }
 240     return s;
 241 }
 242
 243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 244 {
 245     int s, i;
 246     uint32_t *sq = squareTbl + 256;
 247
 248     s = 0;
 249     for (i = 0; i < h; i++) {
 250         s += sq[pix1[0] - pix2[0]];
 251         s += sq[pix1[1] - pix2[1]];
 252         s += sq[pix1[2] - pix2[2]];
 253         s += sq[pix1[3] - pix2[3]];
 254         s += sq[pix1[4] - pix2[4]];
 255         s += sq[pix1[5] - pix2[5]];
 256         s += sq[pix1[6] - pix2[6]];
 257         s += sq[pix1[7] - pix2[7]];
 258         pix1 += line_size;
 259         pix2 += line_size;
 260     }
 261     return s;
 262 }
 263
 264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 265 {
 266     int s, i;
 267     uint32_t *sq = squareTbl + 256;
 268
 269     s = 0;
 270     for (i = 0; i < h; i++) {
 271         s += sq[pix1[ 0] - pix2[ 0]];
 272         s += sq[pix1[ 1] - pix2[ 1]];
 273         s += sq[pix1[ 2] - pix2[ 2]];
 274         s += sq[pix1[ 3] - pix2[ 3]];
 275         s += sq[pix1[ 4] - pix2[ 4]];
 276         s += sq[pix1[ 5] - pix2[ 5]];
 277         s += sq[pix1[ 6] - pix2[ 6]];
 278         s += sq[pix1[ 7] - pix2[ 7]];
 279         s += sq[pix1[ 8] - pix2[ 8]];
 280         s += sq[pix1[ 9] - pix2[ 9]];
 281         s += sq[pix1[10] - pix2[10]];
 282         s += sq[pix1[11] - pix2[11]];
 283         s += sq[pix1[12] - pix2[12]];
 284         s += sq[pix1[13] - pix2[13]];
 285         s += sq[pix1[14] - pix2[14]];
 286         s += sq[pix1[15] - pix2[15]];
 287
 288         pix1 += line_size;
 289         pix2 += line_size;
 290     }
 291     return s;
 292 }
 293
 294
 295 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 296 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
 297     int s, i, j;
 298     const int dec_count= w==8 ? 3 : 4;
 299     int tmp[32*32];
 300     int level, ori;
 301     static const int scale[2][2][4][4]={
 302       {
 303         {
 304             // 9/7 8x8 dec=3
 305             {268, 239, 239, 213},
 306             {  0, 224, 224, 152},
 307             {  0, 135, 135, 110},
 308         },{
 309             // 9/7 16x16 or 32x32 dec=4
 310             {344, 310, 310, 280},
 311             {  0, 320, 320, 228},
 312             {  0, 175, 175, 136},
 313             {  0, 129, 129, 102},
 314         }
 315       },{
 316         {
 317             // 5/3 8x8 dec=3
 318             {275, 245, 245, 218},
 319             {  0, 230, 230, 156},
 320             {  0, 138, 138, 113},
 321         },{
 322             // 5/3 16x16 or 32x32 dec=4
 323             {352, 317, 317, 286},
 324             {  0, 328, 328, 233},
 325             {  0, 180, 180, 140},
 326             {  0, 132, 132, 105},
 327         }
 328       }
 329     };
 330
 331     for (i = 0; i < h; i++) {
 332         for (j = 0; j < w; j+=4) {
 333             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 334             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 335             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 336             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 337         }
 338         pix1 += line_size;
 339         pix2 += line_size;
 340     }
 341
 342     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 343
 344     s=0;
 345     assert(w==h);
 346     for(level=0; level<dec_count; level++){
 347         for(ori= level ? 1 : 0; ori<4; ori++){
 348             int size= w>>(dec_count-level);
 349             int sx= (ori&1) ? size : 0;
 350             int stride= 32<<(dec_count-level);
 351             int sy= (ori&2) ? stride>>1 : 0;
 352
 353             for(i=0; i<size; i++){
 354                 for(j=0; j<size; j++){
 355                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 356                     s += ABS(v);
 357                 }
 358             }
 359         }
 360     }
 361     assert(s>=0);
 362     return s>>9;
 363 #endif
 364 }
 365
 366 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 367     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 368 }
 369
 370 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 371     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 372 }
 373
 374 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 375     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 376 }
 377
 378 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 379     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 380 }
 381
 382 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 383     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 384 }
 385
 386 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 387     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 388 }
 389
 390 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 391 {
 392     int i;
 393
 394     /* read the pixels */
 395     for(i=0;i<8;i++) {
 396         block[0] = pixels[0];
 397         block[1] = pixels[1];
 398         block[2] = pixels[2];
 399         block[3] = pixels[3];
 400         block[4] = pixels[4];
 401         block[5] = pixels[5];
 402         block[6] = pixels[6];
 403         block[7] = pixels[7];
 404         pixels += line_size;
 405         block += 8;
 406     }
 407 }
 408
 409 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 410                           const uint8_t *s2, int stride){
 411     int i;
 412
 413     /* read the pixels */
 414     for(i=0;i<8;i++) {
 415         block[0] = s1[0] - s2[0];
 416         block[1] = s1[1] - s2[1];
 417         block[2] = s1[2] - s2[2];
 418         block[3] = s1[3] - s2[3];
 419         block[4] = s1[4] - s2[4];
 420         block[5] = s1[5] - s2[5];
 421         block[6] = s1[6] - s2[6];
 422         block[7] = s1[7] - s2[7];
 423         s1 += stride;
 424         s2 += stride;
 425         block += 8;
 426     }
 427 }
 428
 429
 430 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 431                                  int line_size)
 432 {
 433     int i;
 434     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 435
 436     /* read the pixels */
 437     for(i=0;i<8;i++) {
 438         pixels[0] = cm[block[0]];
 439         pixels[1] = cm[block[1]];
 440         pixels[2] = cm[block[2]];
 441         pixels[3] = cm[block[3]];
 442         pixels[4] = cm[block[4]];
 443         pixels[5] = cm[block[5]];
 444         pixels[6] = cm[block[6]];
 445         pixels[7] = cm[block[7]];
 446
 447         pixels += line_size;
 448         block += 8;
 449     }
 450 }
 451
 452 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 453                                  int line_size)
 454 {
 455     int i;
 456     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 457
 458     /* read the pixels */
 459     for(i=0;i<4;i++) {
 460         pixels[0] = cm[block[0]];
 461         pixels[1] = cm[block[1]];
 462         pixels[2] = cm[block[2]];
 463         pixels[3] = cm[block[3]];
 464
 465         pixels += line_size;
 466         block += 8;
 467     }
 468 }
 469
 470 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 471                                  int line_size)
 472 {
 473     int i;
 474     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 475
 476     /* read the pixels */
 477     for(i=0;i<2;i++) {
 478         pixels[0] = cm[block[0]];
 479         pixels[1] = cm[block[1]];
 480
 481         pixels += line_size;
 482         block += 8;
 483     }
 484 }
 485
 486 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 487                                         uint8_t *restrict pixels,
 488                                         int line_size)
 489 {
 490     int i, j;
 491
 492     for (i = 0; i < 8; i++) {
 493         for (j = 0; j < 8; j++) {
 494             if (*block < -128)
 495                 *pixels = 0;
 496             else if (*block > 127)
 497                 *pixels = 255;
 498             else
 499                 *pixels = (uint8_t)(*block + 128);
 500             block++;
 501             pixels++;
 502         }
 503         pixels += (line_size - 8);
 504     }
 505 }
 506
 507 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 508                           int line_size)
 509 {
 510     int i;
 511     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 512
 513     /* read the pixels */
 514     for(i=0;i<8;i++) {
 515         pixels[0] = cm[pixels[0] + block[0]];
 516         pixels[1] = cm[pixels[1] + block[1]];
 517         pixels[2] = cm[pixels[2] + block[2]];
 518         pixels[3] = cm[pixels[3] + block[3]];
 519         pixels[4] = cm[pixels[4] + block[4]];
 520         pixels[5] = cm[pixels[5] + block[5]];
 521         pixels[6] = cm[pixels[6] + block[6]];
 522         pixels[7] = cm[pixels[7] + block[7]];
 523         pixels += line_size;
 524         block += 8;
 525     }
 526 }
 527
 528 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 529                           int line_size)
 530 {
 531     int i;
 532     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 533
 534     /* read the pixels */
 535     for(i=0;i<4;i++) {
 536         pixels[0] = cm[pixels[0] + block[0]];
 537         pixels[1] = cm[pixels[1] + block[1]];
 538         pixels[2] = cm[pixels[2] + block[2]];
 539         pixels[3] = cm[pixels[3] + block[3]];
 540         pixels += line_size;
 541         block += 8;
 542     }
 543 }
 544
 545 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 546                           int line_size)
 547 {
 548     int i;
 549     uint8_t *cm = cropTbl + MAX_NEG_CROP;
 550
 551     /* read the pixels */
 552     for(i=0;i<2;i++) {
 553         pixels[0] = cm[pixels[0] + block[0]];
 554         pixels[1] = cm[pixels[1] + block[1]];
 555         pixels += line_size;
 556         block += 8;
 557     }
 558 }
 559
 560 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 561 {
 562     int i;
 563     for(i=0;i<8;i++) {
 564         pixels[0] += block[0];
 565         pixels[1] += block[1];
 566         pixels[2] += block[2];
 567         pixels[3] += block[3];
 568         pixels[4] += block[4];
 569         pixels[5] += block[5];
 570         pixels[6] += block[6];
 571         pixels[7] += block[7];
 572         pixels += line_size;
 573         block += 8;
 574     }
 575 }
 576
 577 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 578 {
 579     int i;
 580     for(i=0;i<4;i++) {
 581         pixels[0] += block[0];
 582         pixels[1] += block[1];
 583         pixels[2] += block[2];
 584         pixels[3] += block[3];
 585         pixels += line_size;
 586         block += 4;
 587     }
 588 }
 589
 590 #if 0
 591
  /*
   * PIXOP2(OPNAME, OP), disabled variant (inside #if 0): generates a family of
   * functions that process 8 samples per row through one uint64_t.
   * OP(dst, value) is the store operation supplied by the instantiation.
   *
   *   OPNAME_pixels            OP of each source row into the destination row
   *   OPNAME_pixels_x2_c       per-byte average of pixels and pixels+1,
   *                            rounded up: (a|b) - (((a^b)&0xFE..)>>1)
   *   OPNAME_no_rnd_pixels_x2_c  same pair, rounded down:
   *                            (a&b) + (((a^b)&0xFE..)>>1)
   *   OPNAME_pixels_y2_c / OPNAME_no_rnd_pixels_y2_c
   *                            the same two averages of a row and the row
   *                            line_size bytes further on
   *   OPNAME_pixels_xy2_c / OPNAME_no_rnd_pixels_xy2_c
   *                            four-sample combination of pixels, pixels+1 and
   *                            the same two positions one row down. Each byte
   *                            is split into its low 2 bits (mask 0x03) and
   *                            its high 6 bits (mask 0xFC, pre-shifted by 2) so
   *                            the packed additions cannot carry into the
   *                            neighbouring byte. The bias added to the low
   *                            parts is 0x02 per byte in the plain variant and
   *                            0x01 in the no_rnd variant. Two output rows are
   *                            produced per loop iteration (i += 2).
   *
   * The trailing CALL_2X_PIXELS lines build the 16-wide versions from the
   * 8-wide ones.
   *
   * NOTE(review): the first function is generated as OPNAME_pixels, but
   * CALL_2X_PIXELS refers to OPNAME_pixels_c; this would need fixing before the
   * block could be enabled.
   */
  592 #define PIXOP2(OPNAME, OP) \
  593 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  594 {\
  595     int i;\
  596     for(i=0; i<h; i++){\
  597         OP(*((uint64_t*)block), LD64(pixels));\
  598         pixels+=line_size;\
  599         block +=line_size;\
  600     }\
  601 }\
  602 \
  603 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  604 {\
  605     int i;\
  606     for(i=0; i<h; i++){\
  607         const uint64_t a= LD64(pixels  );\
  608         const uint64_t b= LD64(pixels+1);\
  609         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  610         pixels+=line_size;\
  611         block +=line_size;\
  612     }\
  613 }\
  614 \
  615 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  616 {\
  617     int i;\
  618     for(i=0; i<h; i++){\
  619         const uint64_t a= LD64(pixels  );\
  620         const uint64_t b= LD64(pixels+1);\
  621         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  622         pixels+=line_size;\
  623         block +=line_size;\
  624     }\
  625 }\
  626 \
  627 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  628 {\
  629     int i;\
  630     for(i=0; i<h; i++){\
  631         const uint64_t a= LD64(pixels          );\
  632         const uint64_t b= LD64(pixels+line_size);\
  633         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  634         pixels+=line_size;\
  635         block +=line_size;\
  636     }\
  637 }\
  638 \
  639 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  640 {\
  641     int i;\
  642     for(i=0; i<h; i++){\
  643         const uint64_t a= LD64(pixels          );\
  644         const uint64_t b= LD64(pixels+line_size);\
  645         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
  646         pixels+=line_size;\
  647         block +=line_size;\
  648     }\
  649 }\
  650 \
  651 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  652 {\
  653         int i;\
  654         const uint64_t a= LD64(pixels  );\
  655         const uint64_t b= LD64(pixels+1);\
  656         uint64_t l0=  (a&0x0303030303030303ULL)\
  657                     + (b&0x0303030303030303ULL)\
  658                     + 0x0202020202020202ULL;\
  659         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  660                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  661         uint64_t l1,h1;\
  662 \
  663         pixels+=line_size;\
  664         for(i=0; i<h; i+=2){\
  665             uint64_t a= LD64(pixels  );\
  666             uint64_t b= LD64(pixels+1);\
  667             l1=  (a&0x0303030303030303ULL)\
  668                + (b&0x0303030303030303ULL);\
  669             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  670               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  671             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  672             pixels+=line_size;\
  673             block +=line_size;\
  674             a= LD64(pixels  );\
  675             b= LD64(pixels+1);\
  676             l0=  (a&0x0303030303030303ULL)\
  677                + (b&0x0303030303030303ULL)\
  678                + 0x0202020202020202ULL;\
  679             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  680               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  681             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  682             pixels+=line_size;\
  683             block +=line_size;\
  684         }\
  685 }\
  686 \
  687 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
  688 {\
  689         int i;\
  690         const uint64_t a= LD64(pixels  );\
  691         const uint64_t b= LD64(pixels+1);\
  692         uint64_t l0=  (a&0x0303030303030303ULL)\
  693                     + (b&0x0303030303030303ULL)\
  694                     + 0x0101010101010101ULL;\
  695         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  696                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  697         uint64_t l1,h1;\
  698 \
  699         pixels+=line_size;\
  700         for(i=0; i<h; i+=2){\
  701             uint64_t a= LD64(pixels  );\
  702             uint64_t b= LD64(pixels+1);\
  703             l1=  (a&0x0303030303030303ULL)\
  704                + (b&0x0303030303030303ULL);\
  705             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  706               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  707             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  708             pixels+=line_size;\
  709             block +=line_size;\
  710             a= LD64(pixels  );\
  711             b= LD64(pixels+1);\
  712             l0=  (a&0x0303030303030303ULL)\
  713                + (b&0x0303030303030303ULL)\
  714                + 0x0101010101010101ULL;\
  715             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  716               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
  717             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
  718             pixels+=line_size;\
  719             block +=line_size;\
  720         }\
  721 }\
  722 \
  723 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
  724 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
  725 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
  726 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
  727 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
  728 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
  729 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 730
  /* Store operation for the disabled 64-bit code path: replaces a with the
   * per-byte average of a and b, rounded up, over eight packed bytes. */
  731 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 732 #else // 64 bit variant
 733
 734 #define PIXOP2(OPNAME, OP) \
 735 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 736     int i;\
 737     for(i=0; i<h; i++){\
 738         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
 739         pixels+=line_size;\
 740         block +=line_size;\
 741     }\
 742 }\
 743 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 744     int i;\
 745     for(i=0; i<h; i++){\
 746         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 747         pixels+=line_size;\
 748         block +=line_size;\
 749     }\
 750 }\
 751 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 752     int i;\
 753     for(i=0; i<h; i++){\
 754         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
 755         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
 756         pixels+=line_size;\
 757         block +=line_size;\
 758     }\
 759 }\
 760 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 761     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 762 }\
 763 \
 764 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 765                                                 int src_stride1, int src_stride2, int h){\
 766     int i;\
 767     for(i=0; i<h; i++){\
 768         uint32_t a,b;\
 769         a= LD32(&src1[i*src_stride1  ]);\
 770         b= LD32(&src2[i*src_stride2  ]);\
 771         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 772         a= LD32(&src1[i*src_stride1+4]);\
 773         b= LD32(&src2[i*src_stride2+4]);\
 774         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 775     }\
 776 }\
 777 \
 778 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 779                                                 int src_stride1, int src_stride2, int h){\
 780     int i;\
 781     for(i=0; i<h; i++){\
 782         uint32_t a,b;\
 783         a= LD32(&src1[i*src_stride1  ]);\
 784         b= LD32(&src2[i*src_stride2  ]);\
 785         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 786         a= LD32(&src1[i*src_stride1+4]);\
 787         b= LD32(&src2[i*src_stride2+4]);\
 788         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 789     }\
 790 }\
 791 \
 792 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 793                                                 int src_stride1, int src_stride2, int h){\
 794     int i;\
 795     for(i=0; i<h; i++){\
 796         uint32_t a,b;\
 797         a= LD32(&src1[i*src_stride1  ]);\
 798         b= LD32(&src2[i*src_stride2  ]);\
 799         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 800     }\
 801 }\
 802 \
 803 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 804                                                 int src_stride1, int src_stride2, int h){\
 805     int i;\
 806     for(i=0; i<h; i++){\
 807         uint32_t a,b;\
 808         a= LD16(&src1[i*src_stride1  ]);\
 809         b= LD16(&src2[i*src_stride2  ]);\
 810         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 811     }\
 812 }\
 813 \
 814 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 815                                                 int src_stride1, int src_stride2, int h){\
 816     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 817     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 818 }\
 819 \
 820 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 821                                                 int src_stride1, int src_stride2, int h){\
 822     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 823     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 824 }\
 825 \
 826 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 827     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 828 }\
 829 \
 830 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 831     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 832 }\
 833 \
 834 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 835     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 836 }\
 837 \
 838 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 839     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 840 }\
 841 \
 842 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 843                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 844     int i;\
 845     for(i=0; i<h; i++){\
 846         uint32_t a, b, c, d, l0, l1, h0, h1;\
 847         a= LD32(&src1[i*src_stride1]);\
 848         b= LD32(&src2[i*src_stride2]);\
 849         c= LD32(&src3[i*src_stride3]);\
 850         d= LD32(&src4[i*src_stride4]);\
 851         l0=  (a&0x03030303UL)\
 852            + (b&0x03030303UL)\
 853            + 0x02020202UL;\
 854         h0= ((a&0xFCFCFCFCUL)>>2)\
 855           + ((b&0xFCFCFCFCUL)>>2);\
 856         l1=  (c&0x03030303UL)\
 857            + (d&0x03030303UL);\
 858         h1= ((c&0xFCFCFCFCUL)>>2)\
 859           + ((d&0xFCFCFCFCUL)>>2);\
 860         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 861         a= LD32(&src1[i*src_stride1+4]);\
 862         b= LD32(&src2[i*src_stride2+4]);\
 863         c= LD32(&src3[i*src_stride3+4]);\
 864         d= LD32(&src4[i*src_stride4+4]);\
 865         l0=  (a&0x03030303UL)\
 866            + (b&0x03030303UL)\
 867            + 0x02020202UL;\
 868         h0= ((a&0xFCFCFCFCUL)>>2)\
 869           + ((b&0xFCFCFCFCUL)>>2);\
 870         l1=  (c&0x03030303UL)\
 871            + (d&0x03030303UL);\
 872         h1= ((c&0xFCFCFCFCUL)>>2)\
 873           + ((d&0xFCFCFCFCUL)>>2);\
 874         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 875     }\
 876 }\
 877 \
 878 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 879     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 880 }\
 881 \
 882 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 883     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 884 }\
 885 \
 886 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 887     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 888 }\
 889 \
 890 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 891     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 892 }\
 893 \
 894 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 895                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 896     int i;\
 897     for(i=0; i<h; i++){\
 898         uint32_t a, b, c, d, l0, l1, h0, h1;\
 899         a= LD32(&src1[i*src_stride1]);\
 900         b= LD32(&src2[i*src_stride2]);\
 901         c= LD32(&src3[i*src_stride3]);\
 902         d= LD32(&src4[i*src_stride4]);\
 903         l0=  (a&0x03030303UL)\
 904            + (b&0x03030303UL)\
 905            + 0x01010101UL;\
 906         h0= ((a&0xFCFCFCFCUL)>>2)\
 907           + ((b&0xFCFCFCFCUL)>>2);\
 908         l1=  (c&0x03030303UL)\
 909            + (d&0x03030303UL);\
 910         h1= ((c&0xFCFCFCFCUL)>>2)\
 911           + ((d&0xFCFCFCFCUL)>>2);\
 912         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 913         a= LD32(&src1[i*src_stride1+4]);\
 914         b= LD32(&src2[i*src_stride2+4]);\
 915         c= LD32(&src3[i*src_stride3+4]);\
 916         d= LD32(&src4[i*src_stride4+4]);\
 917         l0=  (a&0x03030303UL)\
 918            + (b&0x03030303UL)\
 919            + 0x01010101UL;\
 920         h0= ((a&0xFCFCFCFCUL)>>2)\
 921           + ((b&0xFCFCFCFCUL)>>2);\
 922         l1=  (c&0x03030303UL)\
 923            + (d&0x03030303UL);\
 924         h1= ((c&0xFCFCFCFCUL)>>2)\
 925           + ((d&0xFCFCFCFCUL)>>2);\
 926         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 927     }\
 928 }\
 929 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 930                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 931     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 932     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 933 }\
 934 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 935                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 936     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 937     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
 938 }\
 939 \
 940 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 941 {\
 942         int i, a0, b0, a1, b1;\
 943         a0= pixels[0];\
 944         b0= pixels[1] + 2;\
 945         a0 += b0;\
 946         b0 += pixels[2];\
 947 \
 948         pixels+=line_size;\
 949         for(i=0; i<h; i+=2){\
 950             a1= pixels[0];\
 951             b1= pixels[1];\
 952             a1 += b1;\
 953             b1 += pixels[2];\
 954 \
 955             block[0]= (a1+a0)>>2; /* FIXME non put */\
 956             block[1]= (b1+b0)>>2;\
 957 \
 958             pixels+=line_size;\
 959             block +=line_size;\
 960 \
 961             a0= pixels[0];\
 962             b0= pixels[1] + 2;\
 963             a0 += b0;\
 964             b0 += pixels[2];\
 965 \
 966             block[0]= (a1+a0)>>2;\
 967             block[1]= (b1+b0)>>2;\
 968             pixels+=line_size;\
 969             block +=line_size;\
 970         }\
 971 }\
 972 \
 973 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 974 {\
 975         int i;\
 976         const uint32_t a= LD32(pixels  );\
 977         const uint32_t b= LD32(pixels+1);\
 978         uint32_t l0=  (a&0x03030303UL)\
 979                     + (b&0x03030303UL)\
 980                     + 0x02020202UL;\
 981         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
 982                    + ((b&0xFCFCFCFCUL)>>2);\
 983         uint32_t l1,h1;\
 984 \
 985         pixels+=line_size;\
 986         for(i=0; i<h; i+=2){\
 987             uint32_t a= LD32(pixels  );\
 988             uint32_t b= LD32(pixels+1);\
 989             l1=  (a&0x03030303UL)\
 990                + (b&0x03030303UL);\
 991             h1= ((a&0xFCFCFCFCUL)>>2)\
 992               + ((b&0xFCFCFCFCUL)>>2);\
 993             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 994             pixels+=line_size;\
 995             block +=line_size;\
 996             a= LD32(pixels  );\
 997             b= LD32(pixels+1);\
 998             l0=  (a&0x03030303UL)\
 999                + (b&0x03030303UL)\
1000                + 0x02020202UL;\
1001             h0= ((a&0xFCFCFCFCUL)>>2)\
1002               + ((b&0xFCFCFCFCUL)>>2);\
1003             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004             pixels+=line_size;\
1005             block +=line_size;\
1006         }\
1007 }\
1008 \
1009 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1010 {\
1011     int j;\
1012     for(j=0; j<2; j++){\
1013         int i;\
1014         const uint32_t a= LD32(pixels  );\
1015         const uint32_t b= LD32(pixels+1);\
1016         uint32_t l0=  (a&0x03030303UL)\
1017                     + (b&0x03030303UL)\
1018                     + 0x02020202UL;\
1019         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1020                    + ((b&0xFCFCFCFCUL)>>2);\
1021         uint32_t l1,h1;\
1022 \
1023         pixels+=line_size;\
1024         for(i=0; i<h; i+=2){\
1025             uint32_t a= LD32(pixels  );\
1026             uint32_t b= LD32(pixels+1);\
1027             l1=  (a&0x03030303UL)\
1028                + (b&0x03030303UL);\
1029             h1= ((a&0xFCFCFCFCUL)>>2)\
1030               + ((b&0xFCFCFCFCUL)>>2);\
1031             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032             pixels+=line_size;\
1033             block +=line_size;\
1034             a= LD32(pixels  );\
1035             b= LD32(pixels+1);\
1036             l0=  (a&0x03030303UL)\
1037                + (b&0x03030303UL)\
1038                + 0x02020202UL;\
1039             h0= ((a&0xFCFCFCFCUL)>>2)\
1040               + ((b&0xFCFCFCFCUL)>>2);\
1041             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1042             pixels+=line_size;\
1043             block +=line_size;\
1044         }\
1045         pixels+=4-line_size*(h+1);\
1046         block +=4-line_size*h;\
1047     }\
1048 }\
1049 \
1050 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1051 {\
1052     int j;\
1053     for(j=0; j<2; j++){\
1054         int i;\
1055         const uint32_t a= LD32(pixels  );\
1056         const uint32_t b= LD32(pixels+1);\
1057         uint32_t l0=  (a&0x03030303UL)\
1058                     + (b&0x03030303UL)\
1059                     + 0x01010101UL;\
1060         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1061                    + ((b&0xFCFCFCFCUL)>>2);\
1062         uint32_t l1,h1;\
1063 \
1064         pixels+=line_size;\
1065         for(i=0; i<h; i+=2){\
1066             uint32_t a= LD32(pixels  );\
1067             uint32_t b= LD32(pixels+1);\
1068             l1=  (a&0x03030303UL)\
1069                + (b&0x03030303UL);\
1070             h1= ((a&0xFCFCFCFCUL)>>2)\
1071               + ((b&0xFCFCFCFCUL)>>2);\
1072             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073             pixels+=line_size;\
1074             block +=line_size;\
1075             a= LD32(pixels  );\
1076             b= LD32(pixels+1);\
1077             l0=  (a&0x03030303UL)\
1078                + (b&0x03030303UL)\
1079                + 0x01010101UL;\
1080             h0= ((a&0xFCFCFCFCUL)>>2)\
1081               + ((b&0xFCFCFCFCUL)>>2);\
1082             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1083             pixels+=line_size;\
1084             block +=line_size;\
1085         }\
1086         pixels+=4-line_size*(h+1);\
1087         block +=4-line_size*h;\
1088     }\
1089 }\
1090 \
1091 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1092 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1093 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1095 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1099
1100 #define op_avg(a, b) a = rnd_avg32(a, b)
1101 #endif
1102 #define op_put(a, b) a = b
1103
1104 PIXOP2(avg, op_avg)
1105 PIXOP2(put, op_put)
1106 #undef op_avg
1107 #undef op_put
1108
/* Rounded average of two values: (a + b + 1) / 2.
 * Every argument is parenthesized so that expressions built from operators
 * with lower precedence than '+' (e.g. '|', '?:') are averaged as a whole. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Rounded average of four values: (a + b + c + d + 2) / 4. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
1111
/* Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources are walked with the same stride. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels16_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1115
/* Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources are walked with the same stride. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int dst_stride  = stride;
    const int src_stride1 = stride;
    const int src_stride2 = stride;

    put_no_rnd_pixels8_l2(dst, a, b, dst_stride, src_stride1, src_stride2, h);
}
1119
/**
 * Bilinear blend of an 8-pixel-wide block with a single fractional offset.
 *
 * Each output pixel is the weighted sum of the source pixel, its right
 * neighbour, the pixel below and the pixel below-right, plus rounder,
 * shifted right by 8. The four weights are products of (16-x16 | x16) and
 * (16-y16 | y16), so they add up to 256.
 *
 * @param dst     destination, 8 pixels per row, rows stride bytes apart
 * @param src     source; columns 0..8 of rows 0..h are read
 * @param stride  row pitch shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbours
 * @param y16     vertical weight of the lower neighbours
 * @param rounder constant added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int inv_x = 16 - x16;
    const int inv_y = 16 - y16;
    const int A = inv_x * inv_y;   /* weight of the top-left sample     */
    const int B = x16   * inv_y;   /* weight of the top-right sample    */
    const int C = inv_x * y16;     /* weight of the bottom-left sample  */
    const int D = x16   * y16;     /* weight of the bottom-right sample */
    int row;

    for (row = 0; row < h; row++) {
        int col;

        for (col = 0; col < 8; col++) {
            const int sum = A*src[col]        + B*src[col+1]
                          + C*src[stride+col] + D*src[stride+col+1]
                          + rounder;
            dst[col] = sum >> 8;
        }
        dst += stride;
        src += stride;
    }
}
1142
/**
 * Warps an 8-pixel-wide block using a per-pixel source position.
 *
 * The source position starts at (ox, oy), advances by (dxx, dyx) for each
 * output column and by (dxy, dyy) for each output row. The position is
 * shifted right by 16; the low `shift` bits of the result are the blend
 * fractions and the remaining bits are the integer source coordinates.
 *
 * A coordinate is "inside" when it is in [0, width-1) horizontally or
 * [0, height-1) vertically. Inside on both axes gives a four-sample blend,
 * inside on one axis gives a two-sample blend along that axis with the
 * other coordinate clipped, and outside on both copies the clipped sample.
 *
 * @param dst    destination, 8 pixels per row, rows stride bytes apart
 * @param src    source picture
 * @param stride row pitch shared by dst and src
 * @param h      number of output rows
 * @param r      constant added before the final shift by 2*shift
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s         = 1<<shift;
    const int frac_mask = s - 1;
    const int out_shift = shift*2;
    int row;

    /* from here on width/height hold the largest column/row index */
    width--;
    height--;

    for(row=0; row<h; row++, ox+=dxy, oy+=dyy){
        uint8_t *out = dst + row*stride;
        int vx = ox;
        int vy = oy;
        int col;

        for(col=0; col<8; col++, vx+=dxx, vy+=dyx){ //XXX FIXME optimize
            int src_x = vx>>16;
            int src_y = vy>>16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)src_x < width;
            y_inside = (unsigned)src_y < height;

            if(x_inside && y_inside){
                int top, bottom;

                index  = src_x + src_y*stride;
                top    = src[index         ]*(s-frac_x) + src[index       +1]*frac_x;
                bottom = src[index+stride  ]*(s-frac_x) + src[index+stride+1]*frac_x;
                out[col] = (top*(s-frac_y) + bottom*frac_y + r) >> out_shift;
            }else if(x_inside){
                /* row is out of range: blend horizontally on the clipped row */
                index = src_x + clip(src_y, 0, height)*stride;
                out[col] = ((src[index]*(s-frac_x) + src[index+1]*frac_x)*s + r) >> out_shift;
            }else if(y_inside){
                /* column is out of range: blend vertically on the clipped column */
                index = clip(src_x, 0, width) + src_y*stride;
                out[col] = ((src[index]*(s-frac_y) + src[index+stride]*frac_y)*s + r) >> out_shift;
            }else{
                index = clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                out[col] = src[index];
            }
        }
    }
}
1200
/* Copy without interpolation: forwards to the put_pixels routine matching
 * width (2, 4, 8 or 16). Any other width does nothing. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2){
        put_pixels2_c (dst, src, stride, height);
    }else if(width == 4){
        put_pixels4_c (dst, src, stride, height);
    }else if(width == 8){
        put_pixels8_c (dst, src, stride, height);
    }else if(width == 16){
        put_pixels16_c(dst, src, stride, height);
    }
}
1209
/* Horizontal blend with weights 2:1 (current pixel : right neighbour).
 * The weighted sum plus 1 is scaled by 683/2048, i.e. roughly divided by 3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1220
/* Horizontal blend with weights 1:2 (current pixel : right neighbour).
 * The weighted sum plus 1 is scaled by 683/2048, i.e. roughly divided by 3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1231
/* Vertical blend with weights 2:1 (current pixel : pixel one row below).
 * The weighted sum plus 1 is scaled by 683/2048, i.e. roughly divided by 3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + below[col] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1242
/* Two-dimensional blend with weights 4,3 (top row) and 3,2 (row below).
 * The weighted sum plus 6 is scaled by 2731/32768, i.e. roughly divided by 12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1253
/* Two-dimensional blend with weights 3,2 (top row) and 4,3 (row below).
 * The weighted sum plus 6 is scaled by 2731/32768, i.e. roughly divided by 12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1264
/* Vertical blend with weights 1:2 (current pixel : pixel one row below).
 * The weighted sum plus 1 is scaled by 683/2048, i.e. roughly divided by 3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = src[col] + 2*below[col] + 1;
            dst[col] = (683*sum) >> 11;
        }
    }
}
1275
/* Two-dimensional blend with weights 3,4 (top row) and 2,3 (row below).
 * The weighted sum plus 6 is scaled by 2731/32768, i.e. roughly divided by 12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1286
/* Two-dimensional blend with weights 2,3 (top row) and 3,4 (row below).
 * The weighted sum plus 6 is scaled by 2731/32768, i.e. roughly divided by 12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int sum = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*sum) >> 15;
        }
    }
}
1297
/* Averaging without interpolation: forwards to the avg_pixels routine
 * matching width (2, 4, 8 or 16). Any other width does nothing. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if(width == 2){
        avg_pixels2_c (dst, src, stride, height);
    }else if(width == 4){
        avg_pixels4_c (dst, src, stride, height);
    }else if(width == 8){
        avg_pixels8_c (dst, src, stride, height);
    }else if(width == 16){
        avg_pixels16_c(dst, src, stride, height);
    }
}
1306
/* Horizontal 2:1 blend (current pixel : right neighbour), scaled by 683/2048,
 * then averaged with rounding into the existing destination pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1317
/* Horizontal 1:2 blend (current pixel : right neighbour), scaled by 683/2048,
 * then averaged with rounding into the existing destination pixel. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*src[col+1] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1328
/* Vertical 2:1 blend (current pixel : pixel one row below), scaled by 683/2048,
 * then averaged with rounding into the existing destination pixel. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(2*src[col] + below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1339
/* Two-dimensional blend with weights 4,3 (top row) and 3,2 (row below),
 * scaled by 2731/32768, then averaged with rounding into the existing
 * destination pixel. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (2731*(4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1350
/* Two-dimensional blend with weights 3,2 (top row) and 4,3 (row below),
 * scaled by 2731/32768, then averaged with rounding into the existing
 * destination pixel. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1361
/* Vertical 1:2 blend (current pixel : pixel one row below), scaled by 683/2048,
 * then averaged with rounding into the existing destination pixel. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (683*(src[col] + 2*below[col] + 1)) >> 11;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1372
/* Two-dimensional blend with weights 3,4 (top row) and 2,3 (row below),
 * scaled by 2731/32768, then averaged with rounding into the existing
 * destination pixel. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (2731*(3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
1383
/* Two-dimensional blend with weights 2,3 (top row) and 3,4 (row below),
 * scaled by 2731/32768, then averaged with rounding into the existing
 * destination pixel. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row;
    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;
        int col;
        for (col = 0; col < width; col++) {
            const int interp = (2731*(2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6)) >> 15;
            dst[col] = (dst[col] + interp + 1) >> 1;
        }
    }
}
#if 0
/* Disabled generator for fixed-width wrappers around the put_tpel_pixels_mcXY_c
 * routines: TPEL_WIDTH(w) defines put_tpel_pixels<w>_mcXY_c(dst, src, stride,
 * height), each forwarding to the generic routine with width = w.
 * Each body is a plain call; a leading "void" would turn it into a malformed
 * declaration and stop the block from compiling once enabled. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
1415
/**
 * Generates OPNAME##h264_chroma_mc{2,4,8}_c: bilinear blends of a 2-, 4- or
 * 8-pixel-wide block.
 *
 * x and y must lie in 0..7 (asserted). The four weights A..D are products of
 * (8-x | x) and (8-y | y) and therefore add up to 64. For every output pixel
 * the weighted sum of the source pixel, its right neighbour, the pixel below
 * and the pixel below-right is handed to OP together with the destination.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<2; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<4; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(row=0; row<h; row++, dst+=stride, src+=stride){\
        for(col=0; col<8; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
    }\
}

/* b is a sum with total weight 64: add 32 and shift by 6 to round it back to
 * pixel range. op_avg additionally averages, with rounding, into a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
1486
/* Copies h rows of 2 bytes each, moving one 16-bit word per row via
 * LD16/ST16. dst and src advance by their own strides. */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        ST16(dst, LD16(src));
    }
}
1497
/* Copies h rows of 4 bytes each, moving one 32-bit word per row via
 * LD32/ST32. dst and src advance by their own strides. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        ST32(dst, LD32(src));
    }
}
1508
/* Copies h rows of 8 bytes each as two 32-bit words per row via LD32/ST32.
 * dst and src advance by their own strides. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        int off;
        for(off=0; off<8; off+=4){
            ST32(dst+off, LD32(src+off));
        }
    }
}
1520
/* Copies h rows of 16 bytes each as four 32-bit words per row via LD32/ST32.
 * dst and src advance by their own strides. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        int off;
        for(off=0; off<16; off+=4){
            ST32(dst+off, LD32(src+off));
        }
    }
}
1534
/* Copies h rows of 17 bytes each: four 32-bit words via LD32/ST32 followed
 * by the single trailing byte. dst and src advance by their own strides. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        int off;
        for(off=0; off<16; off+=4){
            ST32(dst+off, LD32(src+off));
        }
        dst[16]= src[16];
    }
}
1549
/* Copies h rows of 9 bytes each: two 32-bit words via LD32/ST32 followed by
 * the single trailing byte. dst and src advance by their own strides. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int row;
    for(row=0; row<h; row++, dst+=dstStride, src+=srcStride){
        int off;
        for(off=0; off<8; off+=4){
            ST32(dst+off, LD32(src+off));
        }
        dst[8]= src[8];
    }
}
1562
1563
1564 #define QPEL_MC(r, OPNAME, RND, OP) \
     /* Emits one family of mpeg4_qpel lowpass filters plus the qpel8 / qpel16 mcXY entry points. */\
     /* OPNAME and RND are token-pasted into the generated identifiers; OP(dst, sum) stores one   */\
     /* raw filter sum. The first parameter, r, is not referenced anywhere in this body.          */\
     /* Every output sums 8 taps weighted 20, -6, 3, -1 on each side (total weight 32); any       */\
     /* scaling of that sum is left to OP.                                                        */\
     /* Horizontal 8-wide filter: for each of h rows, 8 outputs are computed from src[0..8].      */\
     /* Taps that would fall outside src[0..8] reuse in-range samples mirrored at the edge        */\
     /* (see dst[0..2] and dst[5..7]).                                                            */\
1565 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
         /* cm is not used by name here; the op_* macros passed in as OP index it. */\
1566     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1567     int i;\
1568     for(i=0; i<h; i++)\
1569     {\
1570         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1571         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1572         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1573         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1574         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1575         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1576         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1577         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1578         dst+=dstStride;\
1579         src+=srcStride;\
1580     }\
1581 }\
1582 \
     /* Vertical 8-high filter: walks w=8 columns; each column reads rows 0..8 of src (9 samples) */\
     /* and writes rows 0..7 of dst, with the same taps and edge mirroring as the horizontal one. */\
1583 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1584     const int w=8;\
1585     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1586     int i;\
1587     for(i=0; i<w; i++)\
1588     {\
1589         const int src0= src[0*srcStride];\
1590         const int src1= src[1*srcStride];\
1591         const int src2= src[2*srcStride];\
1592         const int src3= src[3*srcStride];\
1593         const int src4= src[4*srcStride];\
1594         const int src5= src[5*srcStride];\
1595         const int src6= src[6*srcStride];\
1596         const int src7= src[7*srcStride];\
1597         const int src8= src[8*srcStride];\
1598         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1599         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1600         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1601         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1602         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1603         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1604         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1605         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1606         dst++;\
1607         src++;\
1608     }\
1609 }\
1610 \
     /* Horizontal 16-wide filter: for each of h rows, 16 outputs are computed from src[0..16].   */\
     /* dst[0..2] and dst[13..15] use mirrored in-range samples in place of out-of-range taps.    */\
1611 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1612     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1613     int i;\
1614     \
1615     for(i=0; i<h; i++)\
1616     {\
1617         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1618         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1619         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1620         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1621         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1622         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1623         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1624         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1625         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1626         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1627         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1628         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1629         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1630         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1631         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1632         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1633         dst+=dstStride;\
1634         src+=srcStride;\
1635     }\
1636 }\
1637 \
     /* Vertical 16-high filter: walks w=16 columns; each column reads rows 0..16 of src          */\
     /* (17 samples) and writes rows 0..15 of dst.                                                */\
1638 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1639     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1640     int i;\
1641     const int w=16;\
1642     for(i=0; i<w; i++)\
1643     {\
1644         const int src0= src[0*srcStride];\
1645         const int src1= src[1*srcStride];\
1646         const int src2= src[2*srcStride];\
1647         const int src3= src[3*srcStride];\
1648         const int src4= src[4*srcStride];\
1649         const int src5= src[5*srcStride];\
1650         const int src6= src[6*srcStride];\
1651         const int src7= src[7*srcStride];\
1652         const int src8= src[8*srcStride];\
1653         const int src9= src[9*srcStride];\
1654         const int src10= src[10*srcStride];\
1655         const int src11= src[11*srcStride];\
1656         const int src12= src[12*srcStride];\
1657         const int src13= src[13*srcStride];\
1658         const int src14= src[14*srcStride];\
1659         const int src15= src[15*srcStride];\
1660         const int src16= src[16*srcStride];\
1661         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1662         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1663         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1664         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1665         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1666         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1667         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1668         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1669         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1670         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1671         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1672         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1673         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1674         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1675         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1676         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1677         dst++;\
1678         src++;\
1679     }\
1680 }\
1681 \
     /* ---- 8x8 entry points ------------------------------------------------------------------- */\
     /* Judging by the bodies, in mcXY the digit X picks the horizontal and Y the vertical step:  */\
     /* 0 = untouched, 2 = lowpass only, 1 = lowpass blended with the sample at the block origin, */\
     /* 3 = lowpass blended with the next sample (src+1, or the next row).                        */\
     /* copy_block9, *pixels8_c, *pixels8_l2 and *pixels8_l4 are defined outside this macro; the  */\
     /* notes below assume copy_block9 copies a 9-wide block of the given height and that _l2/_l4 */\
     /* blend two/four sources into dst -- TODO confirm against their definitions.                */\
     /* Intermediate planes are always produced with the put flavour selected by RND; only the    */\
     /* final store into dst uses OPNAME.                                                         */\
     /* mc00: forwards directly to OPNAME pixels8_c. */\
1682 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1683     OPNAME ## pixels8_c(dst, src, stride, 8);\
1684 }\
1685 \
     /* mc10: half = h-lowpass of src (8 rows, stride 8); dst = l2(src, half). */\
1686 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1687     uint8_t half[64];\
1688     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1689     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1690 }\
1691 \
     /* mc20: h-lowpass written straight into dst through OPNAME. */\
1692 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1693     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1694 }\
1695 \
     /* mc30: as mc10, but the blend partner is src+1. */\
1696 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1697     uint8_t half[64];\
1698     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1699     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1700 }\
1701 \
     /* mc01: src is first copied into full (stride 16, 9 rows); half = v-lowpass of full;        */\
     /* dst = l2(full, half). */\
1702 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1703     uint8_t full[16*9];\
1704     uint8_t half[64];\
1705     copy_block9(full, src, 16, stride, 9);\
1706     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1707     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1708 }\
1709 \
     /* mc02: v-lowpass of the copied block written straight into dst through OPNAME. */\
1710 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1711     uint8_t full[16*9];\
1712     copy_block9(full, src, 16, stride, 9);\
1713     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1714 }\
1715 \
     /* mc03: as mc01, but the blend partner is full+16, i.e. one row further down in full. */\
1716 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1717     uint8_t full[16*9];\
1718     uint8_t half[64];\
1719     copy_block9(full, src, 16, stride, 9);\
1720     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1721     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1722 }\
     /* The ff_..._old_c variants below are not static. They build three planes -- halfH          */\
     /* (h-lowpass of full, 9 rows), halfV (v-lowpass of full) and halfHV (v-lowpass of halfH) -- */\
     /* and combine them in one final l4 or l2 call.                                              */\
     /* mc11_old: dst = l4(full, halfH, halfV, halfHV). */\
1723 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724     uint8_t full[16*9];\
1725     uint8_t halfH[72];\
1726     uint8_t halfV[64];\
1727     uint8_t halfHV[64];\
1728     copy_block9(full, src, 16, stride, 9);\
1729     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1733 }\
     /* mc11: halfH = h-lowpass of full (9 rows), then blended in place with full; halfHV =       */\
     /* v-lowpass of that blended halfH; dst = l2(halfH, halfHV). */\
1734 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1735     uint8_t full[16*9];\
1736     uint8_t halfH[72];\
1737     uint8_t halfHV[64];\
1738     copy_block9(full, src, 16, stride, 9);\
1739     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1743 }\
     /* mc31_old: halfV is taken from full+1; dst = l4(full+1, halfH, halfV, halfHV). */\
1744 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745     uint8_t full[16*9];\
1746     uint8_t halfH[72];\
1747     uint8_t halfV[64];\
1748     uint8_t halfHV[64];\
1749     copy_block9(full, src, 16, stride, 9);\
1750     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1751     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1754 }\
     /* mc31: as mc11, but halfH is blended with full+1. */\
1755 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1756     uint8_t full[16*9];\
1757     uint8_t halfH[72];\
1758     uint8_t halfHV[64];\
1759     copy_block9(full, src, 16, stride, 9);\
1760     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1764 }\
     /* mc13_old: dst = l4(full+16, halfH+8, halfV, halfHV); full and halfH are entered one row   */\
     /* down. */\
1765 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1766     uint8_t full[16*9];\
1767     uint8_t halfH[72];\
1768     uint8_t halfV[64];\
1769     uint8_t halfHV[64];\
1770     copy_block9(full, src, 16, stride, 9);\
1771     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1772     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1773     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1775 }\
     /* mc13: as mc11, but the final blend starts at halfH+8 (second row of halfH). */\
1776 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1777     uint8_t full[16*9];\
1778     uint8_t halfH[72];\
1779     uint8_t halfHV[64];\
1780     copy_block9(full, src, 16, stride, 9);\
1781     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1782     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1783     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1784     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1785 }\
     /* mc33_old: halfV from full+1; dst = l4(full+17, halfH+8, halfV, halfHV). */\
1786 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1787     uint8_t full[16*9];\
1788     uint8_t halfH[72];\
1789     uint8_t halfV[64];\
1790     uint8_t halfHV[64];\
1791     copy_block9(full, src, 16, stride, 9);\
1792     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1793     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1794     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1795     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1796 }\
     /* mc33: halfH blended with full+1; final blend starts at halfH+8. */\
1797 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1798     uint8_t full[16*9];\
1799     uint8_t halfH[72];\
1800     uint8_t halfHV[64];\
1801     copy_block9(full, src, 16, stride, 9);\
1802     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1803     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1804     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1805     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1806 }\
     /* mc21: halfH = h-lowpass read directly from src (9 rows, no copy); halfHV = v-lowpass of   */\
     /* halfH; dst = l2(halfH, halfHV). */\
1807 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1808     uint8_t halfH[72];\
1809     uint8_t halfHV[64];\
1810     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1811     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1812     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1813 }\
     /* mc23: as mc21, but the final blend starts at halfH+8. */\
1814 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1815     uint8_t halfH[72];\
1816     uint8_t halfHV[64];\
1817     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1818     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1820 }\
     /* mc12_old: dst = l2(halfV, halfHV); halfH only feeds halfHV. */\
1821 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822     uint8_t full[16*9];\
1823     uint8_t halfH[72];\
1824     uint8_t halfV[64];\
1825     uint8_t halfHV[64];\
1826     copy_block9(full, src, 16, stride, 9);\
1827     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1829     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1831 }\
     /* mc12: halfH = h-lowpass of full blended in place with full; dst = OPNAME v-lowpass of     */\
     /* halfH. */\
1832 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1833     uint8_t full[16*9];\
1834     uint8_t halfH[72];\
1835     copy_block9(full, src, 16, stride, 9);\
1836     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1837     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1838     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1839 }\
     /* mc32_old: as mc12_old, but halfV is taken from full+1. */\
1840 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1841     uint8_t full[16*9];\
1842     uint8_t halfH[72];\
1843     uint8_t halfV[64];\
1844     uint8_t halfHV[64];\
1845     copy_block9(full, src, 16, stride, 9);\
1846     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1847     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1848     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1849     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1850 }\
     /* mc32: as mc12, but halfH is blended with full+1. */\
1851 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1852     uint8_t full[16*9];\
1853     uint8_t halfH[72];\
1854     copy_block9(full, src, 16, stride, 9);\
1855     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1857     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1858 }\
     /* mc22: h-lowpass of src (9 rows) into halfH, then OPNAME v-lowpass of halfH into dst. */\
1859 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1860     uint8_t halfH[72];\
1861     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1862     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1863 }\
     /* ---- 16x16 entry points ----------------------------------------------------------------- */\
     /* Same compositions as the 8x8 set with larger buffers: full is 17 rows at stride 24, halfH */\
     /* is 17 rows of 16 (272 bytes), halfV / halfHV / half are 16x16 (256 bytes). One row down   */\
     /* is full+24 or halfH+16. copy_block17 and the *pixels16_* helpers are defined outside this */\
     /* macro; the same assumptions as for the 8x8 helpers apply -- TODO confirm.                 */\
     /* mc00: forwards directly to OPNAME pixels16_c. */\
1864 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1865     OPNAME ## pixels16_c(dst, src, stride, 16);\
1866 }\
1867 \
     /* mc10: half = h-lowpass of src (16 rows); dst = l2(src, half). */\
1868 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1869     uint8_t half[256];\
1870     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1871     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1872 }\
1873 \
     /* mc20: h-lowpass written straight into dst through OPNAME. */\
1874 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1875     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1876 }\
1877 \
     /* mc30: as mc10, but the blend partner is src+1. */\
1878 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1879     uint8_t half[256];\
1880     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1881     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1882 }\
1883 \
     /* mc01: half = v-lowpass of the copied block; dst = l2(full, half). */\
1884 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1885     uint8_t full[24*17];\
1886     uint8_t half[256];\
1887     copy_block17(full, src, 24, stride, 17);\
1888     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1889     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1890 }\
1891 \
     /* mc02: v-lowpass of the copied block written straight into dst through OPNAME. */\
1892 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1893     uint8_t full[24*17];\
1894     copy_block17(full, src, 24, stride, 17);\
1895     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1896 }\
1897 \
     /* mc03: as mc01, but the blend partner is full+24 (one row down). */\
1898 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1899     uint8_t full[24*17];\
1900     uint8_t half[256];\
1901     copy_block17(full, src, 24, stride, 17);\
1902     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1903     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1904 }\
     /* mc11_old (not static): dst = l4(full, halfH, halfV, halfHV). */\
1905 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906     uint8_t full[24*17];\
1907     uint8_t halfH[272];\
1908     uint8_t halfV[256];\
1909     uint8_t halfHV[256];\
1910     copy_block17(full, src, 24, stride, 17);\
1911     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1915 }\
     /* mc11: halfH = h-lowpass of full (17 rows) blended in place with full; halfHV = v-lowpass  */\
     /* of halfH; dst = l2(halfH, halfHV). */\
1916 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1917     uint8_t full[24*17];\
1918     uint8_t halfH[272];\
1919     uint8_t halfHV[256];\
1920     copy_block17(full, src, 24, stride, 17);\
1921     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1925 }\
     /* mc31_old (not static): halfV from full+1; dst = l4(full+1, halfH, halfV, halfHV). */\
1926 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927     uint8_t full[24*17];\
1928     uint8_t halfH[272];\
1929     uint8_t halfV[256];\
1930     uint8_t halfHV[256];\
1931     copy_block17(full, src, 24, stride, 17);\
1932     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1933     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1936 }\
     /* mc31: as mc11, but halfH is blended with full+1. */\
1937 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1938     uint8_t full[24*17];\
1939     uint8_t halfH[272];\
1940     uint8_t halfHV[256];\
1941     copy_block17(full, src, 24, stride, 17);\
1942     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1946 }\
     /* mc13_old (not static): dst = l4(full+24, halfH+16, halfV, halfHV). */\
1947 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1948     uint8_t full[24*17];\
1949     uint8_t halfH[272];\
1950     uint8_t halfV[256];\
1951     uint8_t halfHV[256];\
1952     copy_block17(full, src, 24, stride, 17);\
1953     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1954     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1955     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1957 }\
     /* mc13: as mc11, but the final blend starts at halfH+16 (second row of halfH). */\
1958 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1959     uint8_t full[24*17];\
1960     uint8_t halfH[272];\
1961     uint8_t halfHV[256];\
1962     copy_block17(full, src, 24, stride, 17);\
1963     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1964     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1965     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1966     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1967 }\
     /* mc33_old (not static): halfV from full+1; dst = l4(full+25, halfH+16, halfV, halfHV). */\
1968 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1969     uint8_t full[24*17];\
1970     uint8_t halfH[272];\
1971     uint8_t halfV[256];\
1972     uint8_t halfHV[256];\
1973     copy_block17(full, src, 24, stride, 17);\
1974     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1975     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1976     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1977     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1978 }\
     /* mc33: halfH blended with full+1; final blend starts at halfH+16. */\
1979 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1980     uint8_t full[24*17];\
1981     uint8_t halfH[272];\
1982     uint8_t halfHV[256];\
1983     copy_block17(full, src, 24, stride, 17);\
1984     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1985     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1986     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1987     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1988 }\
     /* mc21: halfH = h-lowpass read directly from src (17 rows); halfHV = v-lowpass of halfH;    */\
     /* dst = l2(halfH, halfHV). */\
1989 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1990     uint8_t halfH[272];\
1991     uint8_t halfHV[256];\
1992     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1993     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1995 }\
     /* mc23: as mc21, but the final blend starts at halfH+16. */\
1996 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1997     uint8_t halfH[272];\
1998     uint8_t halfHV[256];\
1999     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2000     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2001     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2002 }\
     /* mc12_old (not static): dst = l2(halfV, halfHV); halfH only feeds halfHV. */\
2003 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2004     uint8_t full[24*17];\
2005     uint8_t halfH[272];\
2006     uint8_t halfV[256];\
2007     uint8_t halfHV[256];\
2008     copy_block17(full, src, 24, stride, 17);\
2009     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2010     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2011     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2012     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2013 }\
     /* mc12: halfH = h-lowpass of full blended in place with full; dst = OPNAME v-lowpass of     */\
     /* halfH. */\
2014 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2015     uint8_t full[24*17];\
2016     uint8_t halfH[272];\
2017     copy_block17(full, src, 24, stride, 17);\
2018     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2019     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2020     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2021 }\
     /* mc32_old (not static): as mc12_old, but halfV is taken from full+1. */\
2022 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2023     uint8_t full[24*17];\
2024     uint8_t halfH[272];\
2025     uint8_t halfV[256];\
2026     uint8_t halfHV[256];\
2027     copy_block17(full, src, 24, stride, 17);\
2028     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2029     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2030     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2031     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2032 }\
     /* mc32: as mc12, but halfH is blended with full+1. */\
2033 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2034     uint8_t full[24*17];\
2035     uint8_t halfH[272];\
2036     copy_block17(full, src, 24, stride, 17);\
2037     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2039     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2040 }\
     /* mc22: h-lowpass of src (17 rows) into halfH, then OPNAME v-lowpass of halfH into dst. */\
2041 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2042     uint8_t halfH[272];\
2043     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2044     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2045 }
2046
2047 #define op_avg(px, acc) px = (((px) + cm[((acc) + 16) >> 5] + 1) >> 1) /* px = round-up mean of px and cm[(acc+16)>>5]; cm must be in scope at the use site */
2048 #define op_avg_no_rnd(px, acc) px = (((px) + cm[((acc) + 15) >> 5]) >> 1) /* as op_avg, but bias 15 and a truncating mean */
2049 #define op_put(px, acc) px = cm[((acc) + 16) >> 5] /* px = cm[(acc+16)>>5]: divide the filter sum by 32 with bias 16, then look up in cm */
2050 #define op_put_no_rnd(px, acc) px = cm[((acc) + 15) >> 5] /* as op_put, but with bias 15 */
2051
2052 QPEL_MC(0, put_       , _       , op_put)
     /* ^ put_ family: OPNAME = put_, RND = _, so intermediate helpers resolve to put_mpeg4_... / put_pixels... */
     /* Next: put_no_rnd_ family; with RND = _no_rnd_ the intermediates resolve to put_no_rnd_... names. */
2053 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
     /* avg_ family: RND = _ again, so its intermediates call the put_mpeg4_* filters emitted by the */
     /* first instantiation above; that one therefore has to stay ahead of this line.               */
2054 QPEL_MC(0, avg_       , _       , op_avg)
     /* The avg_no_rnd instantiation below is commented out, so no avg_no_rnd functions are generated here. */
2055 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
     /* The store macros are only needed for the instantiations above; drop them again. */
2056 #undef op_avg
2057 #undef op_avg_no_rnd
2058 #undef op_put
2059 #undef op_put_no_rnd
2060
2061 #if 1
2062 #define H264_LOWPASS(OPNAME, OP, OP2) \
2063 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2064     const int h=2;\
2065     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2066     int i;\
2067     for(i=0; i<h; i++)\
2068     {\
2069         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2070         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2071         dst+=dstStride;\
2072         src+=srcStride;\
2073     }\
2074 }\
2075 \
2076 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2077     const int w=2;\
2078     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2079     int i;\
2080     for(i=0; i<w; i++)\
2081     {\
2082         const int srcB= src[-2*srcStride];\
2083         const int srcA= src[-1*srcStride];\
2084         const int src0= src[0 *srcStride];\
2085         const int src1= src[1 *srcStride];\
2086         const int src2= src[2 *srcStride];\
2087         const int src3= src[3 *srcStride];\
2088         const int src4= src[4 *srcStride];\
2089         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2090         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2091         dst++;\
2092         src++;\
2093     }\
2094 }\
2095 \
2096 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2097     const int h=2;\
2098     const int w=2;\
2099     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2100     int i;\
2101     src -= 2*srcStride;\
2102     for(i=0; i<h+5; i++)\
2103     {\
2104         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2105         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2106         tmp+=tmpStride;\
2107         src+=srcStride;\
2108     }\
2109     tmp -= tmpStride*(h+5-2);\
2110     for(i=0; i<w; i++)\
2111     {\
2112         const int tmpB= tmp[-2*tmpStride];\
2113         const int tmpA= tmp[-1*tmpStride];\
2114         const int tmp0= tmp[0 *tmpStride];\
2115         const int tmp1= tmp[1 *tmpStride];\
2116         const int tmp2= tmp[2 *tmpStride];\
2117         const int tmp3= tmp[3 *tmpStride];\
2118         const int tmp4= tmp[4 *tmpStride];\
2119         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2120         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2121         dst++;\
2122         tmp++;\
2123     }\
2124 }\
2125 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2126     const int h=4;\
2127     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2128     int i;\
2129     for(i=0; i<h; i++)\
2130     {\
2131         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2132         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2133         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2134         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2135         dst+=dstStride;\
2136         src+=srcStride;\
2137     }\
2138 }\
2139 \
2140 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2141     const int w=4;\
2142     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2143     int i;\
2144     for(i=0; i<w; i++)\
2145     {\
2146         const int srcB= src[-2*srcStride];\
2147         const int srcA= src[-1*srcStride];\
2148         const int src0= src[0 *srcStride];\
2149         const int src1= src[1 *srcStride];\
2150         const int src2= src[2 *srcStride];\
2151         const int src3= src[3 *srcStride];\
2152         const int src4= src[4 *srcStride];\
2153         const int src5= src[5 *srcStride];\
2154         const int src6= src[6 *srcStride];\
2155         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2156         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2157         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2158         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2159         dst++;\
2160         src++;\
2161     }\
2162 }\
2163 \
2164 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2165     const int h=4;\
2166     const int w=4;\
2167     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2168     int i;\
2169     src -= 2*srcStride;\
2170     for(i=0; i<h+5; i++)\
2171     {\
2172         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2173         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2174         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2175         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2176         tmp+=tmpStride;\
2177         src+=srcStride;\
2178     }\
2179     tmp -= tmpStride*(h+5-2);\
2180     for(i=0; i<w; i++)\
2181     {\
2182         const int tmpB= tmp[-2*tmpStride];\
2183         const int tmpA= tmp[-1*tmpStride];\
2184         const int tmp0= tmp[0 *tmpStride];\
2185         const int tmp1= tmp[1 *tmpStride];\
2186         const int tmp2= tmp[2 *tmpStride];\
2187         const int tmp3= tmp[3 *tmpStride];\
2188         const int tmp4= tmp[4 *tmpStride];\
2189         const int tmp5= tmp[5 *tmpStride];\
2190         const int tmp6= tmp[6 *tmpStride];\
2191         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2192         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2193         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2194         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2195         dst++;\
2196         tmp++;\
2197     }\
2198 }\
2199 \
2200 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2201     const int h=8;\
2202     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2203     int i;\
2204     for(i=0; i<h; i++)\
2205     {\
2206         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2207         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2208         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2209         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2210         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2211         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2212         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2213         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2214         dst+=dstStride;\
2215         src+=srcStride;\
2216     }\
2217 }\
2218 \
2219 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2220     const int w=8;\
2221     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2222     int i;\
2223     for(i=0; i<w; i++)\
2224     {\
2225         const int srcB= src[-2*srcStride];\
2226         const int srcA= src[-1*srcStride];\
2227         const int src0= src[0 *srcStride];\
2228         const int src1= src[1 *srcStride];\
2229         const int src2= src[2 *srcStride];\
2230         const int src3= src[3 *srcStride];\
2231         const int src4= src[4 *srcStride];\
2232         const int src5= src[5 *srcStride];\
2233         const int src6= src[6 *srcStride];\
2234         const int src7= src[7 *srcStride];\
2235         const int src8= src[8 *srcStride];\
2236         const int src9= src[9 *srcStride];\
2237         const int src10=src[10*srcStride];\
2238         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2239         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2240         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2241         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2242         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2243         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2244         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2245         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2246         dst++;\
2247         src++;\
2248     }\
2249 }\
2250 \
2251 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2252     const int h=8;\
2253     const int w=8;\
2254     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2255     int i;\
2256     src -= 2*srcStride;\
2257     for(i=0; i<h+5; i++)\
2258     {\
2259         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2260         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2261         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2262         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2263         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2264         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2265         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2266         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2267         tmp+=tmpStride;\
2268         src+=srcStride;\
2269     }\
2270     tmp -= tmpStride*(h+5-2);\
2271     for(i=0; i<w; i++)\
2272     {\
2273         const int tmpB= tmp[-2*tmpStride];\
2274         const int tmpA= tmp[-1*tmpStride];\
2275         const int tmp0= tmp[0 *tmpStride];\
2276         const int tmp1= tmp[1 *tmpStride];\
2277         const int tmp2= tmp[2 *tmpStride];\
2278         const int tmp3= tmp[3 *tmpStride];\
2279         const int tmp4= tmp[4 *tmpStride];\
2280         const int tmp5= tmp[5 *tmpStride];\
2281         const int tmp6= tmp[6 *tmpStride];\
2282         const int tmp7= tmp[7 *tmpStride];\
2283         const int tmp8= tmp[8 *tmpStride];\
2284         const int tmp9= tmp[9 *tmpStride];\
2285         const int tmp10=tmp[10*tmpStride];\
2286         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2287         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2288         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2289         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2290         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2291         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2292         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2293         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2294         dst++;\
2295         tmp++;\
2296     }\
2297 }\
2298 \
2299 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2300     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2301     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2302     src += 8*srcStride;\
2303     dst += 8*dstStride;\
2304     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2305     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2306 }\
2307 \
2308 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2309     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2310     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2311     src += 8*srcStride;\
2312     dst += 8*dstStride;\
2313     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2314     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2315 }\
2316 \
2317 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2318     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2319     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2320     src += 8*srcStride;\
2321     dst += 8*dstStride;\
2322     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2323     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2324 }\
2325
/**
 * Generates the sixteen motion-compensation entry points
 * OPNAME h264_qpel SIZE _mcXY_c for one block size.
 *
 * In the generated names X (0..3) selects the horizontal and Y (0..3) the
 * vertical sub-sample position:
 *  - mc00 forwards to OPNAME pixels SIZE _c with no filtering.
 *  - mc20 / mc02 / mc22 are the pure horizontal / vertical / two-dimensional
 *    lowpass results and call the OPNAME flavour of the lowpass directly.
 *  - every other position first builds one or two intermediate SIZE x SIZE
 *    planes with the put_ lowpass variants and then hands two planes to
 *    OPNAME pixels SIZE _l2 (presumably a rounding average of its two
 *    sources -- that helper is defined outside this macro, confirm there).
 *    mc10/mc30 pair the horizontal plane with src / src+1, mc01/mc03 pair
 *    the vertical plane with the current / next row, the diagonal cases
 *    pair a horizontal with a vertical plane, and mc21/mc23/mc12/mc32 pair
 *    a one-dimensional plane with the two-dimensional (hv) plane.
 *
 * Vertical filtering never reads the frame directly: SIZE+5 rows starting
 * two rows above src are copied into `full` (stride SIZE) and `full_mid`
 * points at the copied row that corresponds to src. The hv lowpass uses an
 * int16_t scratch buffer of SIZE*(SIZE+5) entries.
 *
 * Only the last step of each function uses OPNAME, so intermediate planes
 * are always plain stores even for the avg_ instantiation.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
2462
/*
 * Store operators plugged into H264_LOWPASS as OP / OP2. They rely on the
 * local `cm` pointer (cropTbl + MAX_NEG_CROP) that every generated lowpass
 * function declares.
 *
 *  op_*  : single 6-tap pass; the taps (1,-5,20,20,-5,1) sum to 32, hence
 *          the "+16 >> 5" rounding division before the cm[] lookup.
 *  op2_* : horizontal pass followed by vertical pass; combined gain is
 *          32*32 = 1024, hence "+512 >> 10".
 *  *_put : overwrite the destination with the looked-up value.
 *  *_avg : replace the destination with the upward-rounded mean of its
 *          current value and the looked-up value.
 */
#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

/* Emit the put_ and avg_ lowpass kernels, then the per-position wrappers.
 * Note that no avg_ wrappers are generated for block size 2. */
H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

/* The operator macros are only meaningful inside the expansions above. */
#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
2483 #endif
2484
/*
 * Explicit weighted prediction helpers.
 *
 * op_scale1(x): in-place  block[x] = clip_uint8((block[x]*weight + offset) >> log2_denom)
 * op_scale2(x): two-source dst[x]  = clip_uint8((src[x]*weights + dst[x]*weightd + offset)
 *                                               >> (log2_denom+1))
 *
 * H264_WEIGHT(W,H) emits weight_h264_pixelsWxH_c and biweight_h264_pixelsWxH_c
 * for a W-column, H-row block; both walk the block row by row using `stride`.
 *
 * Before the loop the caller-supplied offset is rescaled:
 *  - weight:   offset << log2_denom, plus the rounding term 1 << (log2_denom-1)
 *              when log2_denom is non-zero;
 *  - biweight: ((offset + 1) | 1) << log2_denom.
 */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int row, col; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++){ \
            op_scale1(col); \
        } \
        block += stride; \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int row, col; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(row=0; row<H; row++){ \
        for(col=0; col<W; col++){ \
            op_scale2(col); \
        } \
        dst += stride; \
        src += stride; \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2554
2555 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2556     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2557     int i;
2558
2559     for(i=0; i<h; i++){
2560         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2561         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2562         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2563         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2564         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2565         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2566         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2567         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2568         dst+=dstStride;
2569         src+=srcStride;
2570     }
2571 }
2572
2573 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2574     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2575     int i;
2576
2577     for(i=0; i<w; i++){
2578         const int src_1= src[ -srcStride];
2579         const int src0 = src[0          ];
2580         const int src1 = src[  srcStride];
2581         const int src2 = src[2*srcStride];
2582         const int src3 = src[3*srcStride];
2583         const int src4 = src[4*srcStride];
2584         const int src5 = src[5*srcStride];
2585         const int src6 = src[6*srcStride];
2586         const int src7 = src[7*srcStride];
2587         const int src8 = src[8*srcStride];
2588         const int src9 = src[9*srcStride];
2589         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2590         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2591         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2592         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2593         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2594         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2595         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2596         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2597         src++;
2598         dst++;
2599     }
2600 }
2601
/* mspel position (0,0): no filtering here, the 8-row block is handed
 * straight to put_pixels8_c with the frame stride. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
2605
/* mspel position (1,0): the horizontally filtered 8x8 plane is built in a
 * packed scratch buffer (stride 8) and then combined with the unshifted
 * source through put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src, filtered, stride, stride, 8, 8);
}
2611
/* mspel position (2,0): the horizontal lowpass result for 8 rows is written
 * directly into dst; source and destination share the same stride. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
2615
/* mspel position (3,0): same horizontally filtered plane as position (1,0),
 * but combined with the source shifted one pixel to the right (src+1). */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t filtered[8*8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, filtered, stride, stride, 8, 8);
}
2621
/* mspel position (0,2): the vertical lowpass result for 8 columns is written
 * directly into dst; source and destination share the same stride. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
2625
/* mspel position (1,2): combines the vertically filtered source with the
 * horizontally-then-vertically filtered source.
 * The horizontal pass covers 11 rows starting one row above src, because the
 * vertical filter that follows reads rows -1 .. 9 around its start row;
 * h_rows+8 is therefore the row that corresponds to src. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];
    uint8_t v_plane[8*8];
    uint8_t hv_plane[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_plane, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv_plane, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_plane, hv_plane, stride, 8, 8, 8);
}
/* mspel position (3,2): like position (1,2), except that the purely vertical
 * plane is taken from the source shifted one pixel to the right (src+1).
 * The horizontal pass again covers 11 rows starting one row above src so the
 * following vertical pass has its rows -1 .. 9 available. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];
    uint8_t v_plane[8*8];
    uint8_t hv_plane[8*8];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_plane, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv_plane, h_rows+8, 8, 8, 8);
    put_pixels8_l2(dst, v_plane, hv_plane, stride, 8, 8, 8);
}
/* mspel position (2,2): horizontal pass into an 11-row packed scratch buffer
 * (starting one row above src), then vertical pass from its second row
 * straight into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t h_rows[8*11];

    wmv2_mspel8_h_lowpass(h_rows, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows+8, stride, 8, 8);
}
2649
2650 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2651     int x;
2652     const int strength= ff_h263_loop_filter_strength[qscale];
2653
2654     for(x=0; x<8; x++){
2655         int d1, d2, ad1;
2656         int p0= src[x-2*stride];
2657         int p1= src[x-1*stride];
2658         int p2= src[x+0*stride];
2659         int p3= src[x+1*stride];
2660         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2661
2662         if     (d<-2*strength) d1= 0;
2663         else if(d<-  strength) d1=-2*strength - d;
2664         else if(d<   strength) d1= d;
2665         else if(d< 2*strength) d1= 2*strength - d;
2666         else                   d1= 0;
2667
2668         p1 += d1;
2669         p2 -= d1;
2670         if(p1&256) p1= ~(p1>>31);
2671         if(p2&256) p2= ~(p2>>31);
2672
2673         src[x-1*stride] = p1;
2674         src[x+0*stride] = p2;
2675
2676         ad1= ABS(d1)>>1;
2677
2678         d2= clip((p0-p3)/4, -ad1, ad1);
2679
2680         src[x-2*stride] = p0 - d2;
2681         src[x+  stride] = p3 + d2;
2682     }
2683 }
2684
2685 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2686     int y;
2687     const int strength= ff_h263_loop_filter_strength[qscale];
2688
2689     for(y=0; y<8; y++){
2690         int d1, d2, ad1;
2691         int p0= src[y*stride-2];
2692         int p1= src[y*stride-1];
2693         int p2= src[y*stride+0];
2694         int p3= src[y*stride+1];
2695         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2696
2697         if     (d<-2*strength) d1= 0;
2698         else if(d<-  strength) d1=-2*strength - d;
2699         else if(d<   strength) d1= d;
2700         else if(d< 2*strength) d1= 2*strength - d;
2701         else                   d1= 0;
2702
2703         p1 += d1;
2704         p2 -= d1;
2705         if(p1&256) p1= ~(p1>>31);
2706         if(p2&256) p2= ~(p2>>31);
2707
2708         src[y*stride-1] = p1;
2709         src[y*stride+0] = p2;
2710
2711         ad1= ABS(d1)>>1;
2712
2713         d2= clip((p0-p3)/4, -ad1, ad1);
2714
2715         src[y*stride-2] = p0 - d2;
2716         src[y*stride+1] = p3 + d2;
2717     }
2718 }
2719
/**
 * Separable (1, 2, 1) smoothing of an 8x8 block, in place.
 *
 * Pass 1 (vertical) fills a packed 8x8 int buffer: the top and bottom rows
 * are not filtered and are stored as 4*sample so that they carry the same
 * gain of 4 as the filtered rows 1..6.
 * Pass 2 (horizontal) writes back to src: the left and right columns only
 * undo the vertical gain ("+2 >> 2"), the inner columns apply the second
 * (1, 2, 1) kernel and remove the total gain of 16 ("+8 >> 4").
 *
 * @param src    top-left sample of the block
 * @param stride distance between rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int acc[8*8];
    int row, col;

    for(row=0; row<8; row++){
        const uint8_t *line= src + row*stride;
        int *out= acc + row*8;

        for(col=0; col<8; col++){
            if(row==0 || row==7)
                out[col]= 4*line[col];
            else
                out[col]= line[col - stride] + 2*line[col] + line[col + stride];
        }
    }

    for(row=0; row<8; row++){
        uint8_t *line= src + row*stride;
        const int *in= acc + row*8;

        line[0]= (in[0] + 2)>>2;
        line[7]= (in[7] + 2)>>2;
        for(col=1; col<7; col++)
            line[col]= (in[col-1] + 2*in[col] + in[col+1] + 8)>>4;
    }
}
2746
/**
 * Normal-strength H.264 luma edge filter over 16 edge positions, handled as
 * four segments of four positions each.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples at negative multiples)
 * @param ystride step along the edge to the next position
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0|, |q1 - q0|, |p2 - p0|, |q2 - q0|
 * @param tc0     four per-segment clipping limits; a negative entry leaves
 *                that whole segment untouched
 *
 * p1 / q1 are only modified when the matching p2 / q2 activity test passes,
 * and each such modification widens the clipping range used for p0 / q0 by 1.
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;
    for( seg = 0; seg < 4; seg++ ) {
        const int tc_base = tc0[seg];

        if( tc_base < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( line = 0; line < 4; line++, pix += ystride ) {
            const int p2 = pix[-3*xstride];
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];
            const int mid = ( p0 + q0 + 1 ) >> 1;
            int tc = tc_base;
            int i_delta;

            /* edge too strong or neighbourhood too active: leave as is */
            if( ABS( p0 - q0 ) >= alpha ||
                ABS( p1 - p0 ) >= beta ||
                ABS( q1 - q0 ) >= beta )
                continue;

            if( ABS( p2 - p0 ) < beta ) {
                pix[-2*xstride] = p1 + clip( (( p2 + mid ) >> 1) - p1, -tc_base, tc_base );
                tc++;
            }
            if( ABS( q2 - q0 ) < beta ) {
                pix[   xstride] = q1 + clip( (( q2 + mid ) >> 1) - q1, -tc_base, tc_base );
                tc++;
            }

            i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
            pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
            pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
        }
    }
}
/* Luma filter with the filter taps one row apart (xstride = stride) and the
 * edge walked one pixel at a time (ystride = 1), i.e. the p/q samples lie
 * above/below pix. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Luma filter with the filter taps one pixel apart (xstride = 1) and the
 * edge walked one row at a time (ystride = stride), i.e. the p/q samples lie
 * left/right of pix. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
2795
/**
 * Normal-strength H.264 chroma edge filter over 8 edge positions, handled as
 * four segments of two positions each. Only p0 and q0 are modified.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples at negative multiples)
 * @param ystride step along the edge to the next position
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     four per-segment clipping limits; an entry <= 0 leaves that
 *                whole segment untouched
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int seg, line;
    for( seg = 0; seg < 4; seg++ ) {
        const int tc = tc0[seg];

        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( line = 0; line < 2; line++, pix += ystride ) {
            const int p1 = pix[-2*xstride];
            const int p0 = pix[-1*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            int delta;

            /* edge too strong or neighbourhood too active: leave as is */
            if( ABS( p0 - q0 ) >= alpha ||
                ABS( p1 - p0 ) >= beta ||
                ABS( q1 - q0 ) >= beta )
                continue;

            delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

            pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
            pix[0]        = clip_uint8( q0 - delta );    /* q0' */
        }
    }
}
/* Chroma filter with taps one row apart (xstride = stride), stepping one
 * pixel along the edge (ystride = 1): p/q samples lie above/below pix. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Chroma filter with taps one pixel apart (xstride = 1), stepping one row
 * along the edge (ystride = stride): p/q samples lie left/right of pix. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
2832
/**
 * Strong (intra) H.264 chroma edge filter over 8 edge positions.
 *
 * Where the alpha/beta activity tests pass, p0 and q0 are replaced by fixed
 * weighted means: p0' = (2*p1 + p0 + q1 + 2) >> 2 and
 * q0' = (2*q1 + q0 + p1 + 2) >> 2. No clipping limit is involved.
 *
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (p samples at negative multiples)
 * @param ystride step along the edge to the next position
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;
    for( line = 0; line < 8; line++, pix += ystride ) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        /* edge too strong or neighbourhood too active: leave as is */
        if( ABS( p0 - q0 ) >= alpha ||
            ABS( p1 - p0 ) >= beta ||
            ABS( q1 - q0 ) >= beta )
            continue;

        pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
        pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
    }
}
/* Intra chroma filter with taps one row apart (xstride = stride), stepping
 * one pixel along the edge (ystride = 1): p/q samples lie above/below pix. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Intra chroma filter with taps one pixel apart (xstride = 1), stepping one
 * row along the edge (ystride = stride): p/q samples lie left/right of pix. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
2860
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return sum over all rows and the 16 columns of |pix1 - pix2|
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2888
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * horizontal two-sample combination (avg2 of each pixel and its right
 * neighbour) of a reference block.
 *
 * Each reference row is read from column 0 through column 16.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2916
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * vertical two-sample combination (avg2 of each pixel and the one directly
 * below it) of a reference block.
 *
 * The reference is read for h+1 rows.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2946
/**
 * Sum of absolute differences between a 16-pixel-wide block and the
 * four-sample combination (avg4 of each pixel, its right neighbour and the
 * two pixels below those) of a reference block.
 *
 * The reference is read for h+1 rows, columns 0 through 16.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
2976
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between rows, applied to both blocks
 * @param h         number of rows
 * @return sum over all rows and the 8 columns of |pix1 - pix2|
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<8; x++)
            sad += abs(pix1[x] - pix2[x]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
2996
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * horizontally interpolated reference.
 *
 * The reference sample for column x is avg2(pix2[x], pix2[x+1]).
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 9 columns per row are read
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg2(pix2[x], pix2[x+1]));

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3016
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * vertically interpolated reference.
 *
 * The reference sample for column x is avg2() of pix2[x] and the sample
 * one line below it.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; h+1 rows are read
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg2(pix2[x], below[x]));

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3038
/**
 * Sum of absolute differences between an 8-pixel-wide block and a
 * reference interpolated in both directions.
 *
 * The reference sample for column x is avg4() of pix2[x], pix2[x+1] and
 * the same two columns one line below.
 *
 * @param v         unused
 * @param pix1      block to compare
 * @param pix2      reference; 9 columns and h+1 rows are read
 * @param line_size distance in bytes between two rows
 * @param h         number of rows
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int total = 0;
    int row, x;

    for(row=0; row<h; row++) {
        const uint8_t *below = pix2 + line_size;

        for(x=0; x<8; x++)
            total += abs(pix1[x] - avg4(pix2[x], pix2[x+1], below[x], below[x+1]));

        pix1 += line_size;
        pix2 += line_size;
    }
    return total;
}
3060
3061 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3062     MpegEncContext *c = v;
3063     int score1=0;
3064     int score2=0;
3065     int x,y;
3066
3067     for(y=0; y<h; y++){
3068         for(x=0; x<16; x++){
3069             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3070         }
3071         if(y+1<h){
3072             for(x=0; x<15; x++){
3073                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3074                              - s1[x+1] + s1[x+1+stride])
3075                         -ABS(  s2[x  ] - s2[x  +stride]
3076                              - s2[x+1] + s2[x+1+stride]);
3077             }
3078         }
3079         s1+= stride;
3080         s2+= stride;
3081     }
3082
3083     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3084     else  return score1 + ABS(score2)*8;
3085 }
3086
3087 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3088     MpegEncContext *c = v;
3089     int score1=0;
3090     int score2=0;
3091     int x,y;
3092
3093     for(y=0; y<h; y++){
3094         for(x=0; x<8; x++){
3095             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3096         }
3097         if(y+1<h){
3098             for(x=0; x<7; x++){
3099                 score2+= ABS(  s1[x  ] - s1[x  +stride]
3100                              - s1[x+1] + s1[x+1+stride])
3101                         -ABS(  s2[x  ] - s2[x  +stride]
3102                              - s2[x+1] + s2[x+1+stride]);
3103             }
3104         }
3105         s1+= stride;
3106         s2+= stride;
3107     }
3108
3109     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3110     else  return score1 + ABS(score2)*8;
3111 }
3112
3113 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3114     int i;
3115     unsigned int sum=0;
3116
3117     for(i=0; i<8*8; i++){
3118         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3119         int w= weight[i];
3120         b>>= RECON_SHIFT;
3121         assert(-512<b && b<512);
3122
3123         sum += (w*b)*(w*b)>>4;
3124     }
3125     return sum>>2;
3126 }
3127
3128 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3129     int i;
3130
3131     for(i=0; i<8*8; i++){
3132         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3133     }
3134 }
3135
3136 /**
3137  * permutes an 8x8 block.
3138  * @param block the block which will be permuted according to the given permutation vector
3139  * @param permutation the permutation vector
3140  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3141  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3142  *                  (inverse) permutated to scantable order!
3143  */
3144 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3145 {
3146     int i;
3147     DCTELEM temp[64];
3148
3149     if(last<=0) return;
3150     //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3151
3152     for(i=0; i<=last; i++){
3153         const int j= scantable[i];
3154         temp[j]= block[j];
3155         block[j]=0;
3156     }
3157
3158     for(i=0; i<=last; i++){
3159         const int j= scantable[i];
3160         const int perm_j= permutation[j];
3161         block[perm_j]= temp[j];
3162     }
3163 }
3164
/**
 * Comparison function that ignores all of its arguments.
 * @return always 0
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3168
3169 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3170     int i;
3171
3172     memset(cmp, 0, sizeof(void*)*5);
3173
3174     for(i=0; i<5; i++){
3175         switch(type&0xFF){
3176         case FF_CMP_SAD:
3177             cmp[i]= c->sad[i];
3178             break;
3179         case FF_CMP_SATD:
3180             cmp[i]= c->hadamard8_diff[i];
3181             break;
3182         case FF_CMP_SSE:
3183             cmp[i]= c->sse[i];
3184             break;
3185         case FF_CMP_DCT:
3186             cmp[i]= c->dct_sad[i];
3187             break;
3188         case FF_CMP_DCT264:
3189             cmp[i]= c->dct264_sad[i];
3190             break;
3191         case FF_CMP_DCTMAX:
3192             cmp[i]= c->dct_max[i];
3193             break;
3194         case FF_CMP_PSNR:
3195             cmp[i]= c->quant_psnr[i];
3196             break;
3197         case FF_CMP_BIT:
3198             cmp[i]= c->bit[i];
3199             break;
3200         case FF_CMP_RD:
3201             cmp[i]= c->rd[i];
3202             break;
3203         case FF_CMP_VSAD:
3204             cmp[i]= c->vsad[i];
3205             break;
3206         case FF_CMP_VSSE:
3207             cmp[i]= c->vsse[i];
3208             break;
3209         case FF_CMP_ZERO:
3210             cmp[i]= zero_cmp;
3211             break;
3212         case FF_CMP_NSSE:
3213             cmp[i]= c->nsse[i];
3214             break;
3215         case FF_CMP_W53:
3216             cmp[i]= c->w53[i];
3217             break;
3218         case FF_CMP_W97:
3219             cmp[i]= c->w97[i];
3220             break;
3221         default:
3222             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3223         }
3224     }
3225 }
3226
3227 /**
3228  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3229  */
3230 static void clear_blocks_c(DCTELEM *blocks)
3231 {
3232     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3233 }
3234
/**
 * Adds src to dst byte by byte; each sum wraps modulo 256.
 *
 * @param dst destination, updated in place
 * @param src bytes to add
 * @param w   number of bytes to process; nothing is done when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int pos;

    for(pos=0; pos<w; pos++)
        dst[pos] = (uint8_t)(dst[pos] + src[pos]);
}
3250
/**
 * Stores src1 - src2 into dst byte by byte; each difference wraps modulo 256.
 *
 * @param dst  destination
 * @param src1 minuend bytes
 * @param src2 subtrahend bytes
 * @param w    number of bytes to process; nothing is done when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int pos;

    for(pos=0; pos<w; pos++)
        dst[pos] = (uint8_t)(src1[pos] - src2[pos]);
}
3266
/**
 * Writes the residual of a median prediction for one line.
 *
 * For every position the predictor is mid_pred() of the left sample, the
 * sample above (src1) and (left + above - above-left) masked to 8 bits;
 * dst receives the current sample (src2) minus that predictor.
 *
 * @param dst      residual output, w bytes
 * @param src1     line above the current one
 * @param src2     current line
 * @param w        number of samples
 * @param left     in: sample left of the first position; out: last sample of src2
 *                 processed (only the low 8 bits of the input are used)
 * @param left_top in: sample above-left of the first position; out: last sample
 *                 of src1 processed (only the low 8 bits of the input are used)
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t prev     = *left;
    uint8_t prev_top = *left_top;
    int i;

    for(i=0; i<w; i++){
        const uint8_t top = src1[i];
        const uint8_t cur = src2[i];
        const int pred= mid_pred(prev, top, (prev + top - prev_top)&0xFF);

        dst[i]= cur - pred;

        prev_top= top;
        prev    = cur;
    }

    *left    = prev;
    *left_top= prev_top;
}
3284
/*
 * Butterfly helpers for the Hadamard transforms below.
 *
 * BUTTERFLY2 and BUTTERFLY1 are wrapped in do { } while(0) so that each
 * expands to exactly one statement and is safe inside unbraced if/else or
 * loop bodies; they must be followed by a semicolon.
 */

/* o1 = i1 + i2, o2 = i1 - i2 (inputs and outputs are distinct operands) */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while(0)

/* in-place butterfly: (x, y) <- (x + y, x - y); both operands are read
   once before either is written. The temporaries use names unlikely to
   collide with the caller's variables. */
#define BUTTERFLY1(x,y) \
do {\
    const int bf1_lhs_= (x);\
    const int bf1_rhs_= (y);\
    (x)= bf1_lhs_+bf1_rhs_;\
    (y)= bf1_lhs_-bf1_rhs_;\
} while(0)

/* |x + y| + |x - y|, i.e. a final butterfly stage folded into the absolute sum */
#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3299
/**
 * Sum of the absolute values of the 8x8 Hadamard-transformed difference
 * src - dst.
 *
 * The transform is done with three butterfly stages per row followed by
 * three per column; the last column stage is folded into the absolute sum
 * through BUTTERFLYA.
 *
 * @param s      unused
 * @param dst    block subtracted from src
 * @param src    source block
 * @param stride distance in bytes between two rows
 * @param h      must be 8
 * @return sum of absolute transformed differences
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum=0;
    int i, k;

    assert(h==8);

    /* horizontal pass, one row of differences at a time */
    for(i=0; i<8; i++){
        const uint8_t *sp= src + stride*i;
        const uint8_t *dp= dst + stride*i;
        int *row= temp + 8*i;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(row[k], row[k+1], sp[k]-dp[k], sp[k+1]-dp[k+1]);
        }

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass, one column at a time */
    for(i=0; i<8; i++){
        int *col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    return sum;
}
3351
/**
 * Sum of the absolute values of the 8x8 Hadamard transform of src, with
 * the magnitude of the first (mean) coefficient removed.
 *
 * @param s      unused
 * @param src    source block
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      must be 8
 * @return sum of absolute transformed samples minus |temp[0] + temp[32]|
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum=0;
    int i, k;

    assert(h==8);

    /* horizontal pass, one row of samples at a time */
    for(i=0; i<8; i++){
        const uint8_t *sp= src + stride*i;
        int *row= temp + 8*i;

        for(k=0; k<8; k+=2){
            BUTTERFLY2(row[k], row[k+1], sp[k], sp[k+1]);
        }

        BUTTERFLY1(row[0], row[2]);
        BUTTERFLY1(row[1], row[3]);
        BUTTERFLY1(row[4], row[6]);
        BUTTERFLY1(row[5], row[7]);

        BUTTERFLY1(row[0], row[4]);
        BUTTERFLY1(row[1], row[5]);
        BUTTERFLY1(row[2], row[6]);
        BUTTERFLY1(row[3], row[7]);
    }

    /* vertical pass, one column at a time */
    for(i=0; i<8; i++){
        int *col= temp + i;

        BUTTERFLY1(col[8*0], col[8*1]);
        BUTTERFLY1(col[8*2], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*5]);
        BUTTERFLY1(col[8*6], col[8*7]);

        BUTTERFLY1(col[8*0], col[8*2]);
        BUTTERFLY1(col[8*1], col[8*3]);
        BUTTERFLY1(col[8*4], col[8*6]);
        BUTTERFLY1(col[8*5], col[8*7]);

        sum += BUTTERFLYA(col[8*0], col[8*4])
              +BUTTERFLYA(col[8*1], col[8*5])
              +BUTTERFLYA(col[8*2], col[8*6])
              +BUTTERFLYA(col[8*3], col[8*7]);
    }

    /* take the mean term, already counted in the first column, back out */
    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3399
3400 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3401     MpegEncContext * const s= (MpegEncContext *)c;
3402     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3403     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3404     int sum=0, i;
3405
3406     assert(h==8);
3407
3408     s->dsp.diff_pixels(temp, src1, src2, stride);
3409     s->dsp.fdct(temp);
3410
3411     for(i=0; i<64; i++)
3412         sum+= ABS(temp[i]);
3413
3414     return sum;
3415 }
3416
#ifdef CONFIG_GPL
/*
 * One-dimensional 8-point integer transform built only from additions,
 * subtractions and shifts. The user supplies SRC(x) to read input x and
 * DST(x,v) to consume output x with value v before expanding the macro.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of the absolute values of the DCT8_1D-transformed difference
 * between two 8x8 blocks.
 *
 * Rows are transformed in place first; the column pass does not store
 * its outputs but adds their absolute values to the result directly.
 *
 * @param c      MpegEncContext pointer, must not be NULL (its dsp.diff_pixels is used)
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between two rows
 * @param h      not used by this function
 * @return sum of the 64 absolute transformed differences
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    /* pass a pointer to the first coefficient (dct[0]) rather than the
       array-of-rows pointer, which is not the flat coefficient pointer
       diff_pixels is given elsewhere in this file */
    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += ABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
3469
3470 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3471     MpegEncContext * const s= (MpegEncContext *)c;
3472     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3473     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3474     int sum=0, i;
3475
3476     assert(h==8);
3477
3478     s->dsp.diff_pixels(temp, src1, src2, stride);
3479     s->dsp.fdct(temp);
3480
3481     for(i=0; i<64; i++)
3482         sum= FFMAX(sum, ABS(temp[i]));
3483
3484     return sum;
3485 }
3486
3487 void simple_idct(DCTELEM *block); //FIXME
3488
3489 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3490     MpegEncContext * const s= (MpegEncContext *)c;
3491     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3492     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3493     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3494     int sum=0, i;
3495
3496     assert(h==8);
3497     s->mb_intra=0;
3498
3499     s->dsp.diff_pixels(temp, src1, src2, stride);
3500
3501     memcpy(bak, temp, 64*sizeof(DCTELEM));
3502
3503     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3504     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3505     simple_idct(temp); //FIXME
3506
3507     for(i=0; i<64; i++)
3508         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3509
3510     return sum;
3511 }
3512
3513 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3514     MpegEncContext * const s= (MpegEncContext *)c;
3515     const uint8_t *scantable= s->intra_scantable.permutated;
3516     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3517     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3518     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3519     uint8_t * const bak= (uint8_t*)aligned_bak;
3520     int i, last, run, bits, level, distoration, start_i;
3521     const int esc_length= s->ac_esc_length;
3522     uint8_t * length;
3523     uint8_t * last_length;
3524
3525     assert(h==8);
3526
3527     for(i=0; i<8; i++){
3528         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3529         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3530     }
3531
3532     s->dsp.diff_pixels(temp, src1, src2, stride);
3533
3534     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3535
3536     bits=0;
3537
3538     if (s->mb_intra) {
3539         start_i = 1;
3540         length     = s->intra_ac_vlc_length;
3541         last_length= s->intra_ac_vlc_last_length;
3542         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3543     } else {
3544         start_i = 0;
3545         length     = s->inter_ac_vlc_length;
3546         last_length= s->inter_ac_vlc_last_length;
3547     }
3548
3549     if(last>=start_i){
3550         run=0;
3551         for(i=start_i; i<last; i++){
3552             int j= scantable[i];
3553             level= temp[j];
3554
3555             if(level){
3556                 level+=64;
3557                 if((level&(~127)) == 0){
3558                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3559                 }else
3560                     bits+= esc_length;
3561                 run=0;
3562             }else
3563                 run++;
3564         }
3565         i= scantable[last];
3566
3567         level= temp[i] + 64;
3568
3569         assert(level - 64);
3570
3571         if((level&(~127)) == 0){
3572             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3573         }else
3574             bits+= esc_length;
3575
3576     }
3577
3578     if(last>=0){
3579         if(s->mb_intra)
3580             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3581         else
3582             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3583     }
3584
3585     s->dsp.idct_add(bak, stride, temp);
3586
3587     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3588
3589     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3590 }
3591
3592 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3593     MpegEncContext * const s= (MpegEncContext *)c;
3594     const uint8_t *scantable= s->intra_scantable.permutated;
3595     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3596     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3597     int i, last, run, bits, level, start_i;
3598     const int esc_length= s->ac_esc_length;
3599     uint8_t * length;
3600     uint8_t * last_length;
3601
3602     assert(h==8);
3603
3604     s->dsp.diff_pixels(temp, src1, src2, stride);
3605
3606     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3607
3608     bits=0;
3609
3610     if (s->mb_intra) {
3611         start_i = 1;
3612         length     = s->intra_ac_vlc_length;
3613         last_length= s->intra_ac_vlc_last_length;
3614         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3615     } else {
3616         start_i = 0;
3617         length     = s->inter_ac_vlc_length;
3618         last_length= s->inter_ac_vlc_last_length;
3619     }
3620
3621     if(last>=start_i){
3622         run=0;
3623         for(i=start_i; i<last; i++){
3624             int j= scantable[i];
3625             level= temp[j];
3626
3627             if(level){
3628                 level+=64;
3629                 if((level&(~127)) == 0){
3630                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3631                 }else
3632                     bits+= esc_length;
3633                 run=0;
3634             }else
3635                 run++;
3636         }
3637         i= scantable[last];
3638
3639         level= temp[i] + 64;
3640
3641         assert(level - 64);
3642
3643         if((level&(~127)) == 0){
3644             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3645         }else
3646             bits+= esc_length;
3647     }
3648
3649     return bits;
3650 }
3651
/**
 * Sum of absolute differences between vertically adjacent samples of a
 * 16-pixel-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      number of rows
 * @return accumulated absolute vertical difference
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        uint8_t *below= s + stride;

        for(col=0; col<16; col++)
            total+= ABS(s[col] - below[col]);

        s= below;
    }

    return total;
}
3666
/**
 * Sum of absolute vertical differences of the residual s1 - s2 for a
 * 16-pixel-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two rows
 * @param h      number of rows
 * @return accumulated absolute vertical difference of the residual
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int delta= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];
            total+= ABS(delta);
        }
        s1+= stride;
        s2+= stride;
    }

    return total;
}
3681
/* square of a; the argument is evaluated twice */
#define SQ(a) ((a)*(a))

/**
 * Sum of squared differences between vertically adjacent samples of a
 * 16-pixel-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s      block
 * @param dummy  unused
 * @param stride distance in bytes between two rows
 * @param h      number of rows
 * @return accumulated squared vertical difference
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        uint8_t *below= s + stride;

        for(col=0; col<16; col++)
            total+= SQ(s[col] - below[col]);

        s= below;
    }

    return total;
}
3697
/**
 * Sum of squared vertical differences of the residual s1 - s2 for a
 * 16-pixel-wide block (h-1 row pairs).
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between two rows
 * @param h      number of rows
 * @return accumulated squared vertical difference of the residual
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int total=0;
    int row, col;

    for(row=1; row<h; row++){
        for(col=0; col<16; col++){
            const int delta= s1[col] - s2[col] - s1[col+stride] + s2[col+stride];
            total+= SQ(delta);
        }
        s1+= stride;
        s2+= stride;
    }

    return total;
}
3712
/*
 * Instantiate WARPER8_16_SQ (defined outside this part of the file) once per
 * 8x8 comparison function above. The first argument is the existing 8x8
 * function, the second the name given to the macro for the generated one.
 * NOTE(review): presumably each generated *16_c function applies the 8x8
 * metric over a 16-pixel-wide area -- confirm against the macro definition.
 * The dct264 variant only exists when CONFIG_GPL is defined, matching the
 * guard around dct264_sad8x8_c.
 */
WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WARPER8_16_SQ(rd8x8_c, rd16_c)
WARPER8_16_SQ(bit8x8_c, bit16_c)
3723
/* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted */
/**
 * Calls j_rev_dct() on block, then passes the same block to
 * put_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size passed through to put_pixels_clamped_c()
 * @param block     coefficient block; handed to j_rev_dct() first
 */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/**
 * Calls j_rev_dct() on block, then passes the same block to
 * add_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size passed through to add_pixels_clamped_c()
 * @param block     coefficient block; handed to j_rev_dct() first
 */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
3736
/**
 * Calls j_rev_dct4() on block, then passes the same block to
 * put_pixels_clamped4_c() together with dest and line_size.
 * dsputil_init() selects this variant for lowres==1.
 *
 * @param dest      destination pixels
 * @param line_size passed through to put_pixels_clamped4_c()
 * @param block     coefficient block; handed to j_rev_dct4() first
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Calls j_rev_dct4() on block, then passes the same block to
 * add_pixels_clamped4_c() together with dest and line_size.
 * dsputil_init() selects this variant for lowres==1.
 *
 * @param dest      destination pixels
 * @param line_size passed through to add_pixels_clamped4_c()
 * @param block     coefficient block; handed to j_rev_dct4() first
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
3747
/**
 * Calls j_rev_dct2() on block, then passes the same block to
 * put_pixels_clamped2_c() together with dest and line_size.
 * dsputil_init() selects this variant for lowres==2.
 *
 * @param dest      destination pixels
 * @param line_size passed through to put_pixels_clamped2_c()
 * @param block     coefficient block; handed to j_rev_dct2() first
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Calls j_rev_dct2() on block, then passes the same block to
 * add_pixels_clamped2_c() together with dest and line_size.
 * dsputil_init() selects this variant for lowres==2.
 *
 * @param dest      destination pixels
 * @param line_size passed through to add_pixels_clamped2_c()
 * @param block     coefficient block; handed to j_rev_dct2() first
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
3758
3759 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3760 {
3761     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3762
3763     dest[0] = cm[(block[0] + 4)>>3];
3764 }
3765 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3766 {
3767     uint8_t *cm = cropTbl + MAX_NEG_CROP;
3768
3769     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3770 }
3771
/* Does nothing and returns immediately.
 * NOTE(review): the parameter list is left empty rather than (void);
 * presumably so the function can be stored in function pointers with
 * differing parameter lists -- confirm against its users before changing it. */
static void just_return() { return; }
3773
3774 /* init static data */
3775 void dsputil_static_init(void)
3776 {
3777     int i;
3778
3779     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3780     for(i=0;i<MAX_NEG_CROP;i++) {
3781         cropTbl[i] = 0;
3782         cropTbl[i + MAX_NEG_CROP + 256] = 255;
3783     }
3784
3785     for(i=0;i<512;i++) {
3786         squareTbl[i] = (i - 256) * (i - 256);
3787     }
3788
3789     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3790 }
3791
3792
3793 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3794 {
3795     int i;
3796
3797 #ifdef CONFIG_ENCODERS
3798     if(avctx->dct_algo==FF_DCT_FASTINT) {
3799         c->fdct = fdct_ifast;
3800         c->fdct248 = fdct_ifast248;
3801     }
3802     else if(avctx->dct_algo==FF_DCT_FAAN) {
3803         c->fdct = ff_faandct;
3804         c->fdct248 = ff_faandct248;
3805     }
3806     else {
3807         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3808         c->fdct248 = ff_fdct248_islow;
3809     }
3810 #endif //CONFIG_ENCODERS
3811
3812     if(avctx->lowres==1){
3813         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3814             c->idct_put= ff_jref_idct4_put;
3815             c->idct_add= ff_jref_idct4_add;
3816         }else{
3817             c->idct_put= ff_h264_lowres_idct_put_c;
3818             c->idct_add= ff_h264_lowres_idct_add_c;
3819         }
3820         c->idct    = j_rev_dct4;
3821         c->idct_permutation_type= FF_NO_IDCT_PERM;
3822     }else if(avctx->lowres==2){
3823         c->idct_put= ff_jref_idct2_put;
3824         c->idct_add= ff_jref_idct2_add;
3825         c->idct    = j_rev_dct2;
3826         c->idct_permutation_type= FF_NO_IDCT_PERM;
3827     }else if(avctx->lowres==3){
3828         c->idct_put= ff_jref_idct1_put;
3829         c->idct_add= ff_jref_idct1_add;
3830         c->idct    = j_rev_dct1;
3831         c->idct_permutation_type= FF_NO_IDCT_PERM;
3832     }else{
3833         if(avctx->idct_algo==FF_IDCT_INT){
3834             c->idct_put= ff_jref_idct_put;
3835             c->idct_add= ff_jref_idct_add;
3836             c->idct    = j_rev_dct;
3837             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3838         }else if(avctx->idct_algo==FF_IDCT_VP3){
3839             c->idct_put= ff_vp3_idct_put_c;
3840             c->idct_add= ff_vp3_idct_add_c;
3841             c->idct    = ff_vp3_idct_c;
3842             c->idct_permutation_type= FF_NO_IDCT_PERM;
3843         }else{ //accurate/default
3844             c->idct_put= simple_idct_put;
3845             c->idct_add= simple_idct_add;
3846             c->idct    = simple_idct;
3847             c->idct_permutation_type= FF_NO_IDCT_PERM;
3848         }
3849     }
3850
3851     c->h264_idct_add= ff_h264_idct_add_c;
3852     c->h264_idct8_add= ff_h264_idct8_add_c;
3853     c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3854     c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3855
3856     c->get_pixels = get_pixels_c;
3857     c->diff_pixels = diff_pixels_c;
3858     c->put_pixels_clamped = put_pixels_clamped_c;
3859     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3860     c->add_pixels_clamped = add_pixels_clamped_c;
3861     c->add_pixels8 = add_pixels8_c;
3862     c->add_pixels4 = add_pixels4_c;
3863     c->gmc1 = gmc1_c;
3864     c->gmc = ff_gmc_c;
3865     c->clear_blocks = clear_blocks_c;
3866     c->pix_sum = pix_sum_c;
3867     c->pix_norm1 = pix_norm1_c;
3868
3869     /* TODO [0] 16  [1] 8 */
3870     c->pix_abs[0][0] = pix_abs16_c;
3871     c->pix_abs[0][1] = pix_abs16_x2_c;
3872     c->pix_abs[0][2] = pix_abs16_y2_c;
3873     c->pix_abs[0][3] = pix_abs16_xy2_c;
3874     c->pix_abs[1][0] = pix_abs8_c;
3875     c->pix_abs[1][1] = pix_abs8_x2_c;
3876     c->pix_abs[1][2] = pix_abs8_y2_c;
3877     c->pix_abs[1][3] = pix_abs8_xy2_c;
3878
3879 #define dspfunc(PFX, IDX, NUM) \
3880     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
3881     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
3882     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
3883     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3884
3885     dspfunc(put, 0, 16);
3886     dspfunc(put_no_rnd, 0, 16);
3887     dspfunc(put, 1, 8);
3888     dspfunc(put_no_rnd, 1, 8);
3889     dspfunc(put, 2, 4);
3890     dspfunc(put, 3, 2);
3891
3892     dspfunc(avg, 0, 16);
3893     dspfunc(avg_no_rnd, 0, 16);
3894     dspfunc(avg, 1, 8);
3895     dspfunc(avg_no_rnd, 1, 8);
3896     dspfunc(avg, 2, 4);
3897     dspfunc(avg, 3, 2);
3898 #undef dspfunc
3899
3900     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3901     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3902
3903     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3904     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3905     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3906     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3907     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3908     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3909     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3910     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3911     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3912
3913     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3914     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3915     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3916     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3917     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3918     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3919     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3920     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3921     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3922
3923 #define dspfunc(PFX, IDX, NUM) \
3924     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3925     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3926     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3927     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3928     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3929     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3930     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3931     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3932     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3933     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3934     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3935     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3936     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3937     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3938     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3939     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3940
3941     dspfunc(put_qpel, 0, 16);
3942     dspfunc(put_no_rnd_qpel, 0, 16);
3943
3944     dspfunc(avg_qpel, 0, 16);
3945     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3946
3947     dspfunc(put_qpel, 1, 8);
3948     dspfunc(put_no_rnd_qpel, 1, 8);
3949
3950     dspfunc(avg_qpel, 1, 8);
3951     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3952
3953     dspfunc(put_h264_qpel, 0, 16);
3954     dspfunc(put_h264_qpel, 1, 8);
3955     dspfunc(put_h264_qpel, 2, 4);
3956     dspfunc(put_h264_qpel, 3, 2);
3957     dspfunc(avg_h264_qpel, 0, 16);
3958     dspfunc(avg_h264_qpel, 1, 8);
3959     dspfunc(avg_h264_qpel, 2, 4);
3960
3961 #undef dspfunc
3962     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3963     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3964     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3965     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3966     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3967     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3968
3969     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3970     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3971     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3972     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3973     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3974     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3975     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3976     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3977     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3978     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3979     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3980     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3981     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3982     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3983     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3984     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3985     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3986     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3987     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3988     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3989
3990     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3991     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3992     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3993     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3994     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3995     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3996     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3997     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3998
3999 #define SET_CMP_FUNC(name) \
4000     c->name[0]= name ## 16_c;\
4001     c->name[1]= name ## 8x8_c;
4002
4003     SET_CMP_FUNC(hadamard8_diff)
4004     c->hadamard8_diff[4]= hadamard8_intra16_c;
4005     SET_CMP_FUNC(dct_sad)
4006     SET_CMP_FUNC(dct_max)
4007 #ifdef CONFIG_GPL
4008     SET_CMP_FUNC(dct264_sad)
4009 #endif
4010     c->sad[0]= pix_abs16_c;
4011     c->sad[1]= pix_abs8_c;
4012     c->sse[0]= sse16_c;
4013     c->sse[1]= sse8_c;
4014     c->sse[2]= sse4_c;
4015     SET_CMP_FUNC(quant_psnr)
4016     SET_CMP_FUNC(rd)
4017     SET_CMP_FUNC(bit)
4018     c->vsad[0]= vsad16_c;
4019     c->vsad[4]= vsad_intra16_c;
4020     c->vsse[0]= vsse16_c;
4021     c->vsse[4]= vsse_intra16_c;
4022     c->nsse[0]= nsse16_c;
4023     c->nsse[1]= nsse8_c;
4024     c->w53[0]= w53_16_c;
4025     c->w53[1]= w53_8_c;
4026     c->w97[0]= w97_16_c;
4027     c->w97[1]= w97_8_c;
4028
4029     c->add_bytes= add_bytes_c;
4030     c->diff_bytes= diff_bytes_c;
4031     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4032     c->bswap_buf= bswap_buf;
4033
4034     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4035     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4036     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4037     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4038     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4039     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4040
4041     c->h263_h_loop_filter= h263_h_loop_filter_c;
4042     c->h263_v_loop_filter= h263_v_loop_filter_c;
4043
4044     c->h261_loop_filter= h261_loop_filter_c;
4045
4046     c->try_8x8basis= try_8x8basis_c;
4047     c->add_8x8basis= add_8x8basis_c;
4048
4049 #ifdef CONFIG_SNOW_ENCODER
4050     c->vertical_compose97i = ff_snow_vertical_compose97i;
4051     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4052     c->inner_add_yblock = ff_snow_inner_add_yblock;
4053 #endif
4054
4055     c->shrink[0]= ff_img_copy_plane;
4056     c->shrink[1]= ff_shrink22;
4057     c->shrink[2]= ff_shrink44;
4058     c->shrink[3]= ff_shrink88;
4059
4060     c->prefetch= just_return;
4061
4062 #ifdef HAVE_MMX
4063     dsputil_init_mmx(c, avctx);
4064 #endif
4065 #ifdef ARCH_ARMV4L
4066     dsputil_init_armv4l(c, avctx);
4067 #endif
4068 #ifdef HAVE_MLIB
4069     dsputil_init_mlib(c, avctx);
4070 #endif
4071 #ifdef ARCH_SPARC
4072    dsputil_init_vis(c,avctx);
4073 #endif
4074 #ifdef ARCH_ALPHA
4075     dsputil_init_alpha(c, avctx);
4076 #endif
4077 #ifdef ARCH_POWERPC
4078     dsputil_init_ppc(c, avctx);
4079 #endif
4080 #ifdef HAVE_MMI
4081     dsputil_init_mmi(c, avctx);
4082 #endif
4083 #ifdef ARCH_SH4
4084     dsputil_init_sh4(c,avctx);
4085 #endif
4086
4087     switch(c->idct_permutation_type){
4088     case FF_NO_IDCT_PERM:
4089         for(i=0; i<64; i++)
4090             c->idct_permutation[i]= i;
4091         break;
4092     case FF_LIBMPEG2_IDCT_PERM:
4093         for(i=0; i<64; i++)
4094             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4095         break;
4096     case FF_SIMPLE_IDCT_PERM:
4097         for(i=0; i<64; i++)
4098             c->idct_permutation[i]= simple_mmx_permutation[i];
4099         break;
4100     case FF_TRANSPOSE_IDCT_PERM:
4101         for(i=0; i<64; i++)
4102             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4103         break;
4104     case FF_PARTTRANS_IDCT_PERM:
4105         for(i=0; i<64; i++)
4106             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4107         break;
4108     default:
4109         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4110     }
4111 }
4112