/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * @file dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
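/* Why this works: ~0UL/255 evaluates to 0x01010101UL on a 32-bit target and
 * 0x0101010101010101UL on a 64-bit one, so multiplying it by a byte value
 * replicates that byte across every byte lane of the native word. */
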
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};

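#if 0
/* Illustrative self-check of the property stated above (a hypothetical test
 * helper, not part of this file's build): for all 0<=a<=65536 and 2<=b<=255
 * the multiply-and-shift must reproduce integer division exactly. */
static void check_ff_inverse(void)
{
    unsigned a, b;
    for (b = 2; b <= 255; b++)
        for (a = 0; a <= 65536; a++)
            assert((uint32_t)(((uint64_t)a * ff_inverse[b]) >> 32) == a / b);
}
#endif
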
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#ifdef ARCH_POWERPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
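
/* Typical caller pattern (illustrative, per the MPEG video code): a scan
 * order is combined with the IDCT's input permutation chosen at init time,
 * e.g.
 *     ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
 * afterwards st->permutated[i] maps scan position i directly to the permuted
 * coefficient index, and st->raster_end[i] gives the highest permuted index
 * reached up to scan position i. */
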
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
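
/* Note the "+ 256" bias: ff_squareTbl is filled at init so that
 * (ff_squareTbl + 256)[d] == d*d for -255 <= d <= 255. pix_norm1_c only uses
 * the non-negative half; the sse*_c functions below rely on the bias so a
 * signed pixel difference can be squared with a single table lookup. */
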
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
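
/* The w53/w97 functions are wavelet-domain distortion metrics: the pixel
 * difference block is transformed with ff_spatial_dwt (5/3 or 9/7 kernel)
 * and the subband coefficients are summed with the per-subband weights
 * above, which presumably tracks the snow encoder's own transform when
 * these are selected as comparison functions. */
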
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
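
/* Sketch of the intended use (field names as in the MPEG video decoder; the
 * 17x17 size is just an example for a 16x16 block read with one extra row and
 * column of interpolation context): when a motion vector points partly
 * outside the reference frame, copy the needed area into a scratch buffer
 * with replicated edges and read from that instead:
 *
 *     if(src_x<0 || src_y<0 || src_x + 17 > s->h_edge_pos
 *                           || src_y + 17 > s->v_edge_pos){
 *         ff_emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, 17, 17,
 *                             src_x, src_y, s->h_edge_pos, s->v_edge_pos);
 *         src = s->edge_emu_buffer;
 *     }
 */
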
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= AV_RN32(pixels  );\
    const uint32_t b= AV_RN32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= AV_RN32(pixels  );\
        uint32_t b= AV_RN32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN32(pixels  );\
        b= AV_RN32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c        , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
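
/* The rnd_avg32/no_rnd_avg32 helpers used by PIXOP2 above are the classic
 * SWAR byte-parallel averages, processing four bytes per 32-bit word with no
 * carries between lanes:
 *     (a|b) - (((a^b)&0xFEFEFEFE)>>1)  ==  per-byte (a+b+1)>>1   (rounding)
 *     (a&b) + (((a^b)&0xFEFEFEFE)>>1)  ==  per-byte (a+b)>>1     (truncating)
 * since a+b == (a^b) + 2*(a&b) == 2*(a|b) - (a^b); the 0xFE mask clears each
 * lane's low bit before the shift so nothing leaks into the neighbouring
 * byte. avg2/avg4 above are the plain scalar equivalents used further down. */
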
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
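
/* gmc1_c is plain bilinear interpolation at a 1/16-pel offset (x16,y16):
 * the weights satisfy A+B+C+D == (16-x16+x16)*(16-y16+y16) == 256, so the
 * >>8 renormalizes exactly and "rounder" supplies the rounding constant
 * (e.g. 128 for round-to-nearest). */
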
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
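
/* ff_gmc_c implements affine global motion compensation: (vx,vy) is the
 * source coordinate on a 1/(1<<shift)-pel grid in 16.16 fixed point; it
 * advances by (dxx,dyx) per output column while the row origin (ox,oy)
 * advances by (dxy,dyy) per output line. Each sample is bilinearly
 * interpolated from its four neighbours, and positions outside the picture
 * are clamped to the nearest edge sample. */
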
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
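
/* The tpel (third-pel, SVQ3-style) filters approximate division by 3 and by
 * 12 with fixed-point multiplies: 683 == round(2^11/3), so
 * (683*(2*a + b + 1)) >> 11 is roughly (2*a + b)/3, and 2731 == round(2^15/12)
 * plays the same role for the two-dimensional cases below. */
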
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
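
/* H.264 chroma MC is 1/8-pel bilinear: A+B+C+D == (8-x+x)*(8-y+y) == 64, so
 * op_put's ((b)+32)>>6 renormalizes with round-to-nearest and op_avg then
 * averages the result with the existing destination pixel. When x or y is 0,
 * D == 0 and the 2-D filter degenerates to the cheaper 1-D path with E = B+C
 * along the appropriate axis. */
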
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

1705 #define QPEL_MC(r, OPNAME, RND, OP) \
1706 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1707 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1708 int i;\
1709 for(i=0; i<h; i++)\
1711 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1712 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1713 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1714 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1715 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1716 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1717 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1718 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1719 dst+=dstStride;\
1720 src+=srcStride;\
1724 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1725 const int w=8;\
1726 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1727 int i;\
1728 for(i=0; i<w; i++)\
1730 const int src0= src[0*srcStride];\
1731 const int src1= src[1*srcStride];\
1732 const int src2= src[2*srcStride];\
1733 const int src3= src[3*srcStride];\
1734 const int src4= src[4*srcStride];\
1735 const int src5= src[5*srcStride];\
1736 const int src6= src[6*srcStride];\
1737 const int src7= src[7*srcStride];\
1738 const int src8= src[8*srcStride];\
1739 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1740 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1741 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1742 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1743 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1744 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1745 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1746 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1747 dst++;\
1748 src++;\
1752 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1753 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1754 int i;\
1756 for(i=0; i<h; i++)\
1758 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1759 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1760 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1761 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1762 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1763 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1764 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1765 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1766 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1767 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1768 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1769 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1770 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1771 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1772 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1773 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1774 dst+=dstStride;\
1775 src+=srcStride;\
1776 }\
1777 }\
1778 \
1779 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1780 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1781 int i;\
1782 const int w=16;\
1783 for(i=0; i<w; i++)\
1784 {\
1785 const int src0= src[0*srcStride];\
1786 const int src1= src[1*srcStride];\
1787 const int src2= src[2*srcStride];\
1788 const int src3= src[3*srcStride];\
1789 const int src4= src[4*srcStride];\
1790 const int src5= src[5*srcStride];\
1791 const int src6= src[6*srcStride];\
1792 const int src7= src[7*srcStride];\
1793 const int src8= src[8*srcStride];\
1794 const int src9= src[9*srcStride];\
1795 const int src10= src[10*srcStride];\
1796 const int src11= src[11*srcStride];\
1797 const int src12= src[12*srcStride];\
1798 const int src13= src[13*srcStride];\
1799 const int src14= src[14*srcStride];\
1800 const int src15= src[15*srcStride];\
1801 const int src16= src[16*srcStride];\
1802 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1803 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1804 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1805 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1806 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1807 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1808 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1809 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1810 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1811 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1812 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1813 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1814 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1815 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1816 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1817 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1818 dst++;\
1819 src++;\
1820 }\
1821 }\
1822 \
1823 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1824 OPNAME ## pixels8_c(dst, src, stride, 8);\
1825 }\
1826 \
1827 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1828 uint8_t half[64];\
1829 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1830 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1831 }\
1832 \
1833 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1834 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1835 }\
1836 \
1837 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1838 uint8_t half[64];\
1839 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1840 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1841 }\
1842 \
1843 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1845 uint8_t half[64];\
1846 copy_block9(full, src, 16, stride, 9);\
1847 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1848 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1849 }\
1850 \
1851 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t full[16*9];\
1853 copy_block9(full, src, 16, stride, 9);\
1854 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1855 }\
1856 \
1857 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1858 uint8_t full[16*9];\
1859 uint8_t half[64];\
1860 copy_block9(full, src, 16, stride, 9);\
1861 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1862 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1863 }\
1864 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1865 uint8_t full[16*9];\
1866 uint8_t halfH[72];\
1867 uint8_t halfV[64];\
1868 uint8_t halfHV[64];\
1869 copy_block9(full, src, 16, stride, 9);\
1870 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1871 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1874 }\
1875 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1876 uint8_t full[16*9];\
1877 uint8_t halfH[72];\
1878 uint8_t halfHV[64];\
1879 copy_block9(full, src, 16, stride, 9);\
1880 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1882 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1884 }\
1885 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t full[16*9];\
1887 uint8_t halfH[72];\
1888 uint8_t halfV[64];\
1889 uint8_t halfHV[64];\
1890 copy_block9(full, src, 16, stride, 9);\
1891 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1892 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1894 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1895 }\
1896 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1897 uint8_t full[16*9];\
1898 uint8_t halfH[72];\
1899 uint8_t halfHV[64];\
1900 copy_block9(full, src, 16, stride, 9);\
1901 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1902 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1903 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1904 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1905 }\
1906 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1907 uint8_t full[16*9];\
1908 uint8_t halfH[72];\
1909 uint8_t halfV[64];\
1910 uint8_t halfHV[64];\
1911 copy_block9(full, src, 16, stride, 9);\
1912 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1913 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1914 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1915 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1916 }\
1917 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1918 uint8_t full[16*9];\
1919 uint8_t halfH[72];\
1920 uint8_t halfHV[64];\
1921 copy_block9(full, src, 16, stride, 9);\
1922 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1923 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1924 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1925 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1926 }\
1927 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[16*9];\
1929 uint8_t halfH[72];\
1930 uint8_t halfV[64];\
1931 uint8_t halfHV[64];\
1932 copy_block9(full, src, 16, stride, 9);\
1933 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1934 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1936 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1937 }\
1938 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1939 uint8_t full[16*9];\
1940 uint8_t halfH[72];\
1941 uint8_t halfHV[64];\
1942 copy_block9(full, src, 16, stride, 9);\
1943 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1944 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1945 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1946 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1947 }\
1948 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1949 uint8_t halfH[72];\
1950 uint8_t halfHV[64];\
1951 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1952 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1953 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1954 }\
1955 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t halfH[72];\
1957 uint8_t halfHV[64];\
1958 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1959 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1960 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1961 }\
1962 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1963 uint8_t full[16*9];\
1964 uint8_t halfH[72];\
1965 uint8_t halfV[64];\
1966 uint8_t halfHV[64];\
1967 copy_block9(full, src, 16, stride, 9);\
1968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1969 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1971 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1972 }\
1973 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1974 uint8_t full[16*9];\
1975 uint8_t halfH[72];\
1976 copy_block9(full, src, 16, stride, 9);\
1977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1978 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1979 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1980 }\
1981 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1982 uint8_t full[16*9];\
1983 uint8_t halfH[72];\
1984 uint8_t halfV[64];\
1985 uint8_t halfHV[64];\
1986 copy_block9(full, src, 16, stride, 9);\
1987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1988 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1989 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1990 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1991 }\
1992 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1993 uint8_t full[16*9];\
1994 uint8_t halfH[72];\
1995 copy_block9(full, src, 16, stride, 9);\
1996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1997 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1998 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1999 }\
2000 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2001 uint8_t halfH[72];\
2002 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2003 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2004 }\
2005 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2006 OPNAME ## pixels16_c(dst, src, stride, 16);\
2007 }\
2008 \
2009 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2010 uint8_t half[256];\
2011 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2012 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2013 }\
2014 \
2015 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2016 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2017 }\
2018 \
2019 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2020 uint8_t half[256];\
2021 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2022 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2023 }\
2024 \
2025 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2027 uint8_t half[256];\
2028 copy_block17(full, src, 24, stride, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2030 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2031 }\
2032 \
2033 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 copy_block17(full, src, 24, stride, 17);\
2036 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2037 }\
2038 \
2039 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2040 uint8_t full[24*17];\
2041 uint8_t half[256];\
2042 copy_block17(full, src, 24, stride, 17);\
2043 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2044 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2045 }\
2046 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t full[24*17];\
2048 uint8_t halfH[272];\
2049 uint8_t halfV[256];\
2050 uint8_t halfHV[256];\
2051 copy_block17(full, src, 24, stride, 17);\
2052 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2053 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2056 }\
2057 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058 uint8_t full[24*17];\
2059 uint8_t halfH[272];\
2060 uint8_t halfHV[256];\
2061 copy_block17(full, src, 24, stride, 17);\
2062 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2063 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2064 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2065 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2066 }\
2067 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2068 uint8_t full[24*17];\
2069 uint8_t halfH[272];\
2070 uint8_t halfV[256];\
2071 uint8_t halfHV[256];\
2072 copy_block17(full, src, 24, stride, 17);\
2073 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2074 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2076 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2077 }\
2078 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2079 uint8_t full[24*17];\
2080 uint8_t halfH[272];\
2081 uint8_t halfHV[256];\
2082 copy_block17(full, src, 24, stride, 17);\
2083 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2084 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2085 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2086 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2087 }\
2088 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2089 uint8_t full[24*17];\
2090 uint8_t halfH[272];\
2091 uint8_t halfV[256];\
2092 uint8_t halfHV[256];\
2093 copy_block17(full, src, 24, stride, 17);\
2094 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2095 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2096 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2097 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2098 }\
2099 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2100 uint8_t full[24*17];\
2101 uint8_t halfH[272];\
2102 uint8_t halfHV[256];\
2103 copy_block17(full, src, 24, stride, 17);\
2104 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2105 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2106 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2107 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2108 }\
2109 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2110 uint8_t full[24*17];\
2111 uint8_t halfH[272];\
2112 uint8_t halfV[256];\
2113 uint8_t halfHV[256];\
2114 copy_block17(full, src, 24, stride, 17);\
2115 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2116 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2118 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2119 }\
2120 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2121 uint8_t full[24*17];\
2122 uint8_t halfH[272];\
2123 uint8_t halfHV[256];\
2124 copy_block17(full, src, 24, stride, 17);\
2125 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2126 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2127 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2128 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2129 }\
2130 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2131 uint8_t halfH[272];\
2132 uint8_t halfHV[256];\
2133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2134 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2135 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2136 }\
2137 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2138 uint8_t halfH[272];\
2139 uint8_t halfHV[256];\
2140 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2141 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2142 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2143 }\
2144 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2145 uint8_t full[24*17];\
2146 uint8_t halfH[272];\
2147 uint8_t halfV[256];\
2148 uint8_t halfHV[256];\
2149 copy_block17(full, src, 24, stride, 17);\
2150 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2151 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2153 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2154 }\
2155 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2156 uint8_t full[24*17];\
2157 uint8_t halfH[272];\
2158 copy_block17(full, src, 24, stride, 17);\
2159 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2160 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2161 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2162 }\
2163 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2164 uint8_t full[24*17];\
2165 uint8_t halfH[272];\
2166 uint8_t halfV[256];\
2167 uint8_t halfHV[256];\
2168 copy_block17(full, src, 24, stride, 17);\
2169 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2170 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2171 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2172 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2173 }\
2174 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2175 uint8_t full[24*17];\
2176 uint8_t halfH[272];\
2177 copy_block17(full, src, 24, stride, 17);\
2178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2179 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2180 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2181 }\
2182 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2183 uint8_t halfH[272];\
2184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2185 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2186 }
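/* [Editor's note, not original] Everything above expands, per OPNAME/RND,
 * into the full set of MPEG-4 quarter-pel motion-compensation routines. The
 * lowpass kernels are the 8-tap half-pel filter with taps
 * (-1, 3, -6, 20, 20, -6, 3, -1), mirrored at the block edges; the taps sum
 * to 32, which the OP macros below divide out again. In the mcXY names, X is
 * the horizontal and Y the vertical quarter-pel offset, and the odd quarter
 * positions are built by averaging a full-pel or half-pel plane with a
 * filtered one (the pixels*_l2/_l4 calls). A sketch of the per-pixel
 * rounding average that _l2 applies, under those assumptions:
 */
static inline uint8_t qpel_avg2_sketch(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1); /* the no_rnd variants drop the +1 */
}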
2188 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2189 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2190 #define op_put(a, b) a = cm[((b) + 16)>>5]
2191 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2193 QPEL_MC(0, put_ , _ , op_put)
2194 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2195 QPEL_MC(0, avg_ , _ , op_avg)
2196 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2197 #undef op_avg
2198 #undef op_avg_no_rnd
2199 #undef op_put
2200 #undef op_put_no_rnd
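/* [Editor's note, not original] op_put rounds to nearest, ((b)+16)>>5, while
 * op_put_no_rnd uses ((b)+15)>>5; the cm[] table then clips the result into
 * 0..255. Worked example: a filter sum b = 3184 (99.5 * 32) gives
 * (3184+16)>>5 = 100 with rounding but (3184+15)>>5 = 99 without, which is
 * the difference MPEG-4's rounding control selects between. */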
2202 #if 1
2203 #define H264_LOWPASS(OPNAME, OP, OP2) \
2204 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205 const int h=2;\
2206 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207 int i;\
2208 for(i=0; i<h; i++)\
2209 {\
2210 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2211 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2212 dst+=dstStride;\
2213 src+=srcStride;\
2214 }\
2215 }\
2216 \
2217 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2218 const int w=2;\
2219 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2220 int i;\
2221 for(i=0; i<w; i++)\
2222 {\
2223 const int srcB= src[-2*srcStride];\
2224 const int srcA= src[-1*srcStride];\
2225 const int src0= src[0 *srcStride];\
2226 const int src1= src[1 *srcStride];\
2227 const int src2= src[2 *srcStride];\
2228 const int src3= src[3 *srcStride];\
2229 const int src4= src[4 *srcStride];\
2230 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2231 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2232 dst++;\
2233 src++;\
2234 }\
2235 }\
2236 \
2237 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2238 const int h=2;\
2239 const int w=2;\
2240 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2241 int i;\
2242 src -= 2*srcStride;\
2243 for(i=0; i<h+5; i++)\
2244 {\
2245 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2246 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2247 tmp+=tmpStride;\
2248 src+=srcStride;\
2249 }\
2250 tmp -= tmpStride*(h+5-2);\
2251 for(i=0; i<w; i++)\
2252 {\
2253 const int tmpB= tmp[-2*tmpStride];\
2254 const int tmpA= tmp[-1*tmpStride];\
2255 const int tmp0= tmp[0 *tmpStride];\
2256 const int tmp1= tmp[1 *tmpStride];\
2257 const int tmp2= tmp[2 *tmpStride];\
2258 const int tmp3= tmp[3 *tmpStride];\
2259 const int tmp4= tmp[4 *tmpStride];\
2260 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2261 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2262 dst++;\
2263 tmp++;\
2264 }\
2265 }\
2266 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2267 const int h=4;\
2268 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2269 int i;\
2270 for(i=0; i<h; i++)\
2271 {\
2272 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2273 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2274 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2275 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2276 dst+=dstStride;\
2277 src+=srcStride;\
2278 }\
2279 }\
2280 \
2281 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282 const int w=4;\
2283 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284 int i;\
2285 for(i=0; i<w; i++)\
2286 {\
2287 const int srcB= src[-2*srcStride];\
2288 const int srcA= src[-1*srcStride];\
2289 const int src0= src[0 *srcStride];\
2290 const int src1= src[1 *srcStride];\
2291 const int src2= src[2 *srcStride];\
2292 const int src3= src[3 *srcStride];\
2293 const int src4= src[4 *srcStride];\
2294 const int src5= src[5 *srcStride];\
2295 const int src6= src[6 *srcStride];\
2296 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2297 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2298 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2299 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2300 dst++;\
2301 src++;\
2302 }\
2303 }\
2304 \
2305 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2306 const int h=4;\
2307 const int w=4;\
2308 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2309 int i;\
2310 src -= 2*srcStride;\
2311 for(i=0; i<h+5; i++)\
2312 {\
2313 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2314 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2315 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2316 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2317 tmp+=tmpStride;\
2318 src+=srcStride;\
2319 }\
2320 tmp -= tmpStride*(h+5-2);\
2321 for(i=0; i<w; i++)\
2322 {\
2323 const int tmpB= tmp[-2*tmpStride];\
2324 const int tmpA= tmp[-1*tmpStride];\
2325 const int tmp0= tmp[0 *tmpStride];\
2326 const int tmp1= tmp[1 *tmpStride];\
2327 const int tmp2= tmp[2 *tmpStride];\
2328 const int tmp3= tmp[3 *tmpStride];\
2329 const int tmp4= tmp[4 *tmpStride];\
2330 const int tmp5= tmp[5 *tmpStride];\
2331 const int tmp6= tmp[6 *tmpStride];\
2332 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2333 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2334 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2335 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2336 dst++;\
2337 tmp++;\
2338 }\
2339 }\
2340 \
2341 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2342 const int h=8;\
2343 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2344 int i;\
2345 for(i=0; i<h; i++)\
2346 {\
2347 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2348 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2349 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2350 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2351 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2352 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2353 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2354 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2355 dst+=dstStride;\
2356 src+=srcStride;\
2357 }\
2358 }\
2359 \
2360 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2361 const int w=8;\
2362 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2363 int i;\
2364 for(i=0; i<w; i++)\
2365 {\
2366 const int srcB= src[-2*srcStride];\
2367 const int srcA= src[-1*srcStride];\
2368 const int src0= src[0 *srcStride];\
2369 const int src1= src[1 *srcStride];\
2370 const int src2= src[2 *srcStride];\
2371 const int src3= src[3 *srcStride];\
2372 const int src4= src[4 *srcStride];\
2373 const int src5= src[5 *srcStride];\
2374 const int src6= src[6 *srcStride];\
2375 const int src7= src[7 *srcStride];\
2376 const int src8= src[8 *srcStride];\
2377 const int src9= src[9 *srcStride];\
2378 const int src10=src[10*srcStride];\
2379 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2380 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2381 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2382 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2383 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2384 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2385 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2386 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2387 dst++;\
2388 src++;\
2389 }\
2390 }\
2391 \
2392 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2393 const int h=8;\
2394 const int w=8;\
2395 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2396 int i;\
2397 src -= 2*srcStride;\
2398 for(i=0; i<h+5; i++)\
2399 {\
2400 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2401 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2402 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2403 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2404 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2405 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2406 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2407 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2408 tmp+=tmpStride;\
2409 src+=srcStride;\
2410 }\
2411 tmp -= tmpStride*(h+5-2);\
2412 for(i=0; i<w; i++)\
2413 {\
2414 const int tmpB= tmp[-2*tmpStride];\
2415 const int tmpA= tmp[-1*tmpStride];\
2416 const int tmp0= tmp[0 *tmpStride];\
2417 const int tmp1= tmp[1 *tmpStride];\
2418 const int tmp2= tmp[2 *tmpStride];\
2419 const int tmp3= tmp[3 *tmpStride];\
2420 const int tmp4= tmp[4 *tmpStride];\
2421 const int tmp5= tmp[5 *tmpStride];\
2422 const int tmp6= tmp[6 *tmpStride];\
2423 const int tmp7= tmp[7 *tmpStride];\
2424 const int tmp8= tmp[8 *tmpStride];\
2425 const int tmp9= tmp[9 *tmpStride];\
2426 const int tmp10=tmp[10*tmpStride];\
2427 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2428 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2429 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2430 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2431 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2432 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2433 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2434 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2435 dst++;\
2436 tmp++;\
2437 }\
2438 }\
2439 \
2440 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2441 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2442 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2443 src += 8*srcStride;\
2444 dst += 8*dstStride;\
2445 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2446 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2447 }\
2448 \
2449 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2450 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2451 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2452 src += 8*srcStride;\
2453 dst += 8*dstStride;\
2454 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2455 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2456 }\
2457 \
2458 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2459 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2461 src += 8*srcStride;\
2462 dst += 8*dstStride;\
2463 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2464 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2465 }\
2466 \
2467 #define H264_MC(OPNAME, SIZE) \
2468 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2469 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2470 }\
2471 \
2472 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2473 uint8_t half[SIZE*SIZE];\
2474 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2475 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2476 }\
2477 \
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2479 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2480 }\
2481 \
2482 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2483 uint8_t half[SIZE*SIZE];\
2484 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2485 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2486 }\
2487 \
2488 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2489 uint8_t full[SIZE*(SIZE+5)];\
2490 uint8_t * const full_mid= full + SIZE*2;\
2491 uint8_t half[SIZE*SIZE];\
2492 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2493 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2494 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2495 }\
2496 \
2497 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2498 uint8_t full[SIZE*(SIZE+5)];\
2499 uint8_t * const full_mid= full + SIZE*2;\
2500 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2501 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2502 }\
2503 \
2504 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2505 uint8_t full[SIZE*(SIZE+5)];\
2506 uint8_t * const full_mid= full + SIZE*2;\
2507 uint8_t half[SIZE*SIZE];\
2508 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2509 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2510 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2511 }\
2512 \
2513 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2514 uint8_t full[SIZE*(SIZE+5)];\
2515 uint8_t * const full_mid= full + SIZE*2;\
2516 uint8_t halfH[SIZE*SIZE];\
2517 uint8_t halfV[SIZE*SIZE];\
2518 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2519 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2520 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2521 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2522 }\
2523 \
2524 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2525 uint8_t full[SIZE*(SIZE+5)];\
2526 uint8_t * const full_mid= full + SIZE*2;\
2527 uint8_t halfH[SIZE*SIZE];\
2528 uint8_t halfV[SIZE*SIZE];\
2529 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2530 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2531 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2532 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2533 }\
2534 \
2535 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2536 uint8_t full[SIZE*(SIZE+5)];\
2537 uint8_t * const full_mid= full + SIZE*2;\
2538 uint8_t halfH[SIZE*SIZE];\
2539 uint8_t halfV[SIZE*SIZE];\
2540 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2541 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2542 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2543 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2544 }\
2545 \
2546 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2547 uint8_t full[SIZE*(SIZE+5)];\
2548 uint8_t * const full_mid= full + SIZE*2;\
2549 uint8_t halfH[SIZE*SIZE];\
2550 uint8_t halfV[SIZE*SIZE];\
2551 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2552 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2553 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2554 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2555 }\
2556 \
2557 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2558 int16_t tmp[SIZE*(SIZE+5)];\
2559 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2560 }\
2561 \
2562 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2563 int16_t tmp[SIZE*(SIZE+5)];\
2564 uint8_t halfH[SIZE*SIZE];\
2565 uint8_t halfHV[SIZE*SIZE];\
2566 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2567 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2568 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2569 }\
2570 \
2571 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2572 int16_t tmp[SIZE*(SIZE+5)];\
2573 uint8_t halfH[SIZE*SIZE];\
2574 uint8_t halfHV[SIZE*SIZE];\
2575 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2576 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2577 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2578 }\
2579 \
2580 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2581 uint8_t full[SIZE*(SIZE+5)];\
2582 uint8_t * const full_mid= full + SIZE*2;\
2583 int16_t tmp[SIZE*(SIZE+5)];\
2584 uint8_t halfV[SIZE*SIZE];\
2585 uint8_t halfHV[SIZE*SIZE];\
2586 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2587 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2588 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2589 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2590 }\
2591 \
2592 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2593 uint8_t full[SIZE*(SIZE+5)];\
2594 uint8_t * const full_mid= full + SIZE*2;\
2595 int16_t tmp[SIZE*(SIZE+5)];\
2596 uint8_t halfV[SIZE*SIZE];\
2597 uint8_t halfHV[SIZE*SIZE];\
2598 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2599 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2600 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2601 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2602 }
2604 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2605 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2606 #define op_put(a, b) a = cm[((b) + 16)>>5]
2607 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2608 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2610 H264_LOWPASS(put_ , op_put, op2_put)
2611 H264_LOWPASS(avg_ , op_avg, op2_avg)
2612 H264_MC(put_, 2)
2613 H264_MC(put_, 4)
2614 H264_MC(put_, 8)
2615 H264_MC(put_, 16)
2616 H264_MC(avg_, 4)
2617 H264_MC(avg_, 8)
2618 H264_MC(avg_, 16)
2620 #undef op_avg
2621 #undef op_put
2622 #undef op2_avg
2623 #undef op2_put
2624 #endif
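/* [Editor's note, not original] The H264_LOWPASS kernels above implement the
 * H.264 6-tap half-pel filter (1,-5,20,20,-5,1)/32. In the diagonal (hv)
 * paths the horizontal stage is kept unnormalized in the int16_t tmp[] plane
 * and both divisions happen at the end, which is why the op2_* macros shift
 * by 10 (32*32 = 1024) instead of 5. A sketch of one interior tap
 * evaluation, assuming two valid samples on each side:
 */
static inline int h264_sixtap_sketch(int m2, int m1, int p0, int p1, int p2, int p3)
{
    /* caller adds 16 (or 512 on the two-stage path) before shifting */
    return (p0 + p1)*20 - (m1 + p2)*5 + (m2 + p3);
}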
2626 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2627 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2628 #define H264_WEIGHT(W,H) \
2629 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2630 int y; \
2631 offset <<= log2_denom; \
2632 if(log2_denom) offset += 1<<(log2_denom-1); \
2633 for(y=0; y<H; y++, block += stride){ \
2634 op_scale1(0); \
2635 op_scale1(1); \
2636 if(W==2) continue; \
2637 op_scale1(2); \
2638 op_scale1(3); \
2639 if(W==4) continue; \
2640 op_scale1(4); \
2641 op_scale1(5); \
2642 op_scale1(6); \
2643 op_scale1(7); \
2644 if(W==8) continue; \
2645 op_scale1(8); \
2646 op_scale1(9); \
2647 op_scale1(10); \
2648 op_scale1(11); \
2649 op_scale1(12); \
2650 op_scale1(13); \
2651 op_scale1(14); \
2652 op_scale1(15); \
2653 } \
2654 } \
2655 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2656 int y; \
2657 offset = ((offset + 1) | 1) << log2_denom; \
2658 for(y=0; y<H; y++, dst += stride, src += stride){ \
2659 op_scale2(0); \
2660 op_scale2(1); \
2661 if(W==2) continue; \
2662 op_scale2(2); \
2663 op_scale2(3); \
2664 if(W==4) continue; \
2665 op_scale2(4); \
2666 op_scale2(5); \
2667 op_scale2(6); \
2668 op_scale2(7); \
2669 if(W==8) continue; \
2670 op_scale2(8); \
2671 op_scale2(9); \
2672 op_scale2(10); \
2673 op_scale2(11); \
2674 op_scale2(12); \
2675 op_scale2(13); \
2676 op_scale2(14); \
2677 op_scale2(15); \
2678 } \
2679 }
2681 H264_WEIGHT(16,16)
2682 H264_WEIGHT(16,8)
2683 H264_WEIGHT(8,16)
2684 H264_WEIGHT(8,8)
2685 H264_WEIGHT(8,4)
2686 H264_WEIGHT(4,8)
2687 H264_WEIGHT(4,4)
2688 H264_WEIGHT(4,2)
2689 H264_WEIGHT(2,4)
2690 H264_WEIGHT(2,2)
2692 #undef op_scale1
2693 #undef op_scale2
2694 #undef H264_WEIGHT
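/* [Editor's note, not original] weight_* implements H.264 explicit weighted
 * prediction on one reference and biweight_* the two-reference blend; the
 * ((offset + 1) | 1) trick folds the spec's rounding term and the averaged
 * offsets into a single odd constant. A scalar sketch of what op_scale2
 * computes per pixel, under those assumptions:
 */
static inline uint8_t h264_biweight_sample_sketch(uint8_t dst, uint8_t src,
                                                  int log2_denom, int weightd,
                                                  int weights, int offset)
{
    const int o = ((offset + 1) | 1) << log2_denom;
    const int v = (src*weights + dst*weightd + o) >> (log2_denom + 1);
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* av_clip_uint8() */
}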
2696 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2697 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2698 int i;
2700 for(i=0; i<h; i++){
2701 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2702 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2703 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2704 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2705 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2706 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2707 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2708 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2709 dst+=dstStride;
2710 src+=srcStride;
2711 }
2712 }
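/* [Editor's note, not original] wmv2_mspel8_h_lowpass is the WMV2 half-pel
 * filter with taps (-1,9,9,-1)/16; the +8 rounds to nearest before >>4 and
 * cm[] clips. Worked example: samples 90,100,110,120 around the half-pel
 * position give (9*(100+110) - (90+120) + 8) >> 4 = 1688 >> 4 = 105. */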
2714 #ifdef CONFIG_CAVS_DECODER
2715 /* AVS specific */
2716 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2718 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2719 put_pixels8_c(dst, src, stride, 8);
2720 }
2721 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2722 avg_pixels8_c(dst, src, stride, 8);
2723 }
2724 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2725 put_pixels16_c(dst, src, stride, 16);
2726 }
2727 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2728 avg_pixels16_c(dst, src, stride, 16);
2729 }
2730 #endif /* CONFIG_CAVS_DECODER */
2732 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2733 /* VC-1 specific */
2734 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2736 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2737 put_pixels8_c(dst, src, stride, 8);
2738 }
2739 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2741 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2743 /* H264 specific */
2744 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2746 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2747 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2748 int i;
2750 for(i=0; i<w; i++){
2751 const int src_1= src[ -srcStride];
2752 const int src0 = src[0 ];
2753 const int src1 = src[ srcStride];
2754 const int src2 = src[2*srcStride];
2755 const int src3 = src[3*srcStride];
2756 const int src4 = src[4*srcStride];
2757 const int src5 = src[5*srcStride];
2758 const int src6 = src[6*srcStride];
2759 const int src7 = src[7*srcStride];
2760 const int src8 = src[8*srcStride];
2761 const int src9 = src[9*srcStride];
2762 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2763 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2764 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2765 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2766 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2767 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2768 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2769 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2770 src++;
2771 dst++;
2772 }
2773 }
2775 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2776 put_pixels8_c(dst, src, stride, 8);
2777 }
2779 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2780 uint8_t half[64];
2781 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2782 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2783 }
2785 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2786 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2787 }
2789 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2790 uint8_t half[64];
2791 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2792 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2793 }
2795 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2796 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2797 }
2799 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2800 uint8_t halfH[88];
2801 uint8_t halfV[64];
2802 uint8_t halfHV[64];
2803 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2804 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2805 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2806 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2807 }
2808 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2809 uint8_t halfH[88];
2810 uint8_t halfV[64];
2811 uint8_t halfHV[64];
2812 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2813 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2814 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2815 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2816 }
2817 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2818 uint8_t halfH[88];
2819 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2820 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2821 }
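/* [Editor's note, not original] In the mc12/mc32/mc22 cases above the
 * horizontal pass filters 11 rows starting at src-stride, so the vertical
 * pass has its off-block context; halfH+8 then addresses the second row of
 * the 8-wide temporary, i.e. the row aligned with the block itself. */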
2823 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2824 if(ENABLE_ANY_H263) {
2825 int x;
2826 const int strength= ff_h263_loop_filter_strength[qscale];
2828 for(x=0; x<8; x++){
2829 int d1, d2, ad1;
2830 int p0= src[x-2*stride];
2831 int p1= src[x-1*stride];
2832 int p2= src[x+0*stride];
2833 int p3= src[x+1*stride];
2834 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2836 if (d<-2*strength) d1= 0;
2837 else if(d<- strength) d1=-2*strength - d;
2838 else if(d< strength) d1= d;
2839 else if(d< 2*strength) d1= 2*strength - d;
2840 else d1= 0;
2842 p1 += d1;
2843 p2 -= d1;
2844 if(p1&256) p1= ~(p1>>31);
2845 if(p2&256) p2= ~(p2>>31);
2847 src[x-1*stride] = p1;
2848 src[x+0*stride] = p2;
2850 ad1= FFABS(d1)>>1;
2852 d2= av_clip((p0-p3)/4, -ad1, ad1);
2854 src[x-2*stride] = p0 - d2;
2855 src[x+ stride] = p3 + d2;
2856 }
2857 }
2858 }
2860 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2861 if(ENABLE_ANY_H263) {
2862 int y;
2863 const int strength= ff_h263_loop_filter_strength[qscale];
2865 for(y=0; y<8; y++){
2866 int d1, d2, ad1;
2867 int p0= src[y*stride-2];
2868 int p1= src[y*stride-1];
2869 int p2= src[y*stride+0];
2870 int p3= src[y*stride+1];
2871 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2873 if (d<-2*strength) d1= 0;
2874 else if(d<- strength) d1=-2*strength - d;
2875 else if(d< strength) d1= d;
2876 else if(d< 2*strength) d1= 2*strength - d;
2877 else d1= 0;
2879 p1 += d1;
2880 p2 -= d1;
2881 if(p1&256) p1= ~(p1>>31);
2882 if(p2&256) p2= ~(p2>>31);
2884 src[y*stride-1] = p1;
2885 src[y*stride+0] = p2;
2887 ad1= FFABS(d1)>>1;
2889 d2= av_clip((p0-p3)/4, -ad1, ad1);
2891 src[y*stride-2] = p0 - d2;
2892 src[y*stride+1] = p3 + d2;
2893 }
2894 }
2895 }
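/* [Editor's note, not original] The delta d1 above is the H.263 (Annex J)
 * dead-zone ramp: identity for small |d|, folded back to zero between
 * strength and 2*strength. An equivalent formulation of the same mapping,
 * as a sketch:
 */
static inline int h263_d1_sketch(int d, int strength)
{
    const int ad   = d < 0 ? -d : d;
    const int sign = d < 0 ? -1 : 1;
    if (ad >= 2*strength) return 0;      /* beyond the ramp: no correction */
    if (ad <= strength)   return d;      /* identity region */
    return sign * (2*strength - ad);     /* folded-back region */
}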
2897 static void h261_loop_filter_c(uint8_t *src, int stride){
2898 int x,y,xy,yz;
2899 int temp[64];
2901 for(x=0; x<8; x++){
2902 temp[x ] = 4*src[x ];
2903 temp[x + 7*8] = 4*src[x + 7*stride];
2904 }
2905 for(y=1; y<7; y++){
2906 for(x=0; x<8; x++){
2907 xy = y * stride + x;
2908 yz = y * 8 + x;
2909 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2910 }
2911 }
2913 for(y=0; y<8; y++){
2914 src[ y*stride] = (temp[ y*8] + 2)>>2;
2915 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2916 for(x=1; x<7; x++){
2917 xy = y * stride + x;
2918 yz = y * 8 + x;
2919 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2920 }
2921 }
2922 }
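/* [Editor's note, not original] h261_loop_filter_c is the H.261 separable
 * [1 2 1] smoother: the temp[] pass filters each column (edge rows copied
 * through as 4*src), the second pass filters each row, and the combined
 * weight of 16 is removed by the final >>4; edge columns only carry the
 * vertical weight of 4, hence their (temp + 2) >> 2. */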
2924 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2925 {
2926 int i, d;
2927 for( i = 0; i < 4; i++ ) {
2928 if( tc0[i] < 0 ) {
2929 pix += 4*ystride;
2930 continue;
2931 }
2932 for( d = 0; d < 4; d++ ) {
2933 const int p0 = pix[-1*xstride];
2934 const int p1 = pix[-2*xstride];
2935 const int p2 = pix[-3*xstride];
2936 const int q0 = pix[0];
2937 const int q1 = pix[1*xstride];
2938 const int q2 = pix[2*xstride];
2940 if( FFABS( p0 - q0 ) < alpha &&
2941 FFABS( p1 - p0 ) < beta &&
2942 FFABS( q1 - q0 ) < beta ) {
2944 int tc = tc0[i];
2945 int i_delta;
2947 if( FFABS( p2 - p0 ) < beta ) {
2948 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2949 tc++;
2950 }
2951 if( FFABS( q2 - q0 ) < beta ) {
2952 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2953 tc++;
2954 }
2956 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2957 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2958 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2959 }
2960 pix += ystride;
2961 }
2962 }
2963 }
2964 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2965 {
2966 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2967 }
2968 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2969 {
2970 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2971 }
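/* [Editor's note, not original] In the luma deblocker above, alpha gates the
 * edge step |p0-q0| and beta the inner activity; tc starts at tc0[i] and
 * grows by one for each side whose p2/q2 also passed the beta test. Worked
 * example with p1,p0,q0,q1 = 60,70,90,95 and tc = 4:
 * i_delta = clip((((90-70)<<2) + (60-95) + 4) >> 3, -4, 4) = clip(6, -4, 4)
 *         = 4, so p0' = 74 and q0' = 86. */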
2973 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2974 {
2975 int i, d;
2976 for( i = 0; i < 4; i++ ) {
2977 const int tc = tc0[i];
2978 if( tc <= 0 ) {
2979 pix += 2*ystride;
2980 continue;
2981 }
2982 for( d = 0; d < 2; d++ ) {
2983 const int p0 = pix[-1*xstride];
2984 const int p1 = pix[-2*xstride];
2985 const int q0 = pix[0];
2986 const int q1 = pix[1*xstride];
2988 if( FFABS( p0 - q0 ) < alpha &&
2989 FFABS( p1 - p0 ) < beta &&
2990 FFABS( q1 - q0 ) < beta ) {
2992 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2994 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2995 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
2996 }
2997 pix += ystride;
2998 }
2999 }
3000 }
3001 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3002 {
3003 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3004 }
3005 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3006 {
3007 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3008 }
3010 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3011 {
3012 int d;
3013 for( d = 0; d < 8; d++ ) {
3014 const int p0 = pix[-1*xstride];
3015 const int p1 = pix[-2*xstride];
3016 const int q0 = pix[0];
3017 const int q1 = pix[1*xstride];
3019 if( FFABS( p0 - q0 ) < alpha &&
3020 FFABS( p1 - p0 ) < beta &&
3021 FFABS( q1 - q0 ) < beta ) {
3023 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3024 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3025 }
3026 pix += ystride;
3027 }
3028 }
3029 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3030 {
3031 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3032 }
3033 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3034 {
3035 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3036 }
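/* [Editor's note, not original] The intra chroma variant above runs over all
 * 8 edge positions unconditionally and, instead of a tc-clipped correction,
 * replaces p0/q0 with the fixed smoothing (2*p1 + p0 + q1 + 2) >> 2, since
 * strong intra edges carry no tc0 bound. */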
3038 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3039 {
3040 int s, i;
3042 s = 0;
3043 for(i=0;i<h;i++) {
3044 s += abs(pix1[0] - pix2[0]);
3045 s += abs(pix1[1] - pix2[1]);
3046 s += abs(pix1[2] - pix2[2]);
3047 s += abs(pix1[3] - pix2[3]);
3048 s += abs(pix1[4] - pix2[4]);
3049 s += abs(pix1[5] - pix2[5]);
3050 s += abs(pix1[6] - pix2[6]);
3051 s += abs(pix1[7] - pix2[7]);
3052 s += abs(pix1[8] - pix2[8]);
3053 s += abs(pix1[9] - pix2[9]);
3054 s += abs(pix1[10] - pix2[10]);
3055 s += abs(pix1[11] - pix2[11]);
3056 s += abs(pix1[12] - pix2[12]);
3057 s += abs(pix1[13] - pix2[13]);
3058 s += abs(pix1[14] - pix2[14]);
3059 s += abs(pix1[15] - pix2[15]);
3060 pix1 += line_size;
3061 pix2 += line_size;
3062 }
3063 return s;
3064 }
3066 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3067 {
3068 int s, i;
3070 s = 0;
3071 for(i=0;i<h;i++) {
3072 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3073 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3074 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3075 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3076 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3077 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3078 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3079 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3080 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3081 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3082 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3083 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3084 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3085 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3086 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3087 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3088 pix1 += line_size;
3089 pix2 += line_size;
3090 }
3091 return s;
3092 }
3094 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3095 {
3096 int s, i;
3097 uint8_t *pix3 = pix2 + line_size;
3099 s = 0;
3100 for(i=0;i<h;i++) {
3101 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3102 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3103 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3104 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3105 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3106 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3107 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3108 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3109 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3110 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3111 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3112 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3113 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3114 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3115 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3116 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3117 pix1 += line_size;
3118 pix2 += line_size;
3119 pix3 += line_size;
3120 }
3121 return s;
3122 }
3124 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3125 {
3126 int s, i;
3127 uint8_t *pix3 = pix2 + line_size;
3129 s = 0;
3130 for(i=0;i<h;i++) {
3131 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3132 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3133 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3134 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3135 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3136 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3137 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3138 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3139 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3140 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3141 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3142 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3143 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3144 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3145 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3146 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3147 pix1 += line_size;
3148 pix2 += line_size;
3149 pix3 += line_size;
3150 }
3151 return s;
3152 }
3154 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3155 {
3156 int s, i;
3158 s = 0;
3159 for(i=0;i<h;i++) {
3160 s += abs(pix1[0] - pix2[0]);
3161 s += abs(pix1[1] - pix2[1]);
3162 s += abs(pix1[2] - pix2[2]);
3163 s += abs(pix1[3] - pix2[3]);
3164 s += abs(pix1[4] - pix2[4]);
3165 s += abs(pix1[5] - pix2[5]);
3166 s += abs(pix1[6] - pix2[6]);
3167 s += abs(pix1[7] - pix2[7]);
3168 pix1 += line_size;
3169 pix2 += line_size;
3170 }
3171 return s;
3172 }
3174 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3175 {
3176 int s, i;
3178 s = 0;
3179 for(i=0;i<h;i++) {
3180 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3181 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3182 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3183 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3184 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3185 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3186 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3187 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3188 pix1 += line_size;
3189 pix2 += line_size;
3191 return s;
3194 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3196 int s, i;
3197 uint8_t *pix3 = pix2 + line_size;
3199 s = 0;
3200 for(i=0;i<h;i++) {
3201 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3202 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3203 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3204 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3205 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3206 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3207 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3208 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3209 pix1 += line_size;
3210 pix2 += line_size;
3211 pix3 += line_size;
3213 return s;
3216 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3218 int s, i;
3219 uint8_t *pix3 = pix2 + line_size;
3221 s = 0;
3222 for(i=0;i<h;i++) {
3223 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3224 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3225 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3226 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3227 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3228 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3229 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3230 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3231 pix1 += line_size;
3232 pix2 += line_size;
3233 pix3 += line_size;
3235 return s;
3238 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3239 MpegEncContext *c = v;
3240 int score1=0;
3241 int score2=0;
3242 int x,y;
3244 for(y=0; y<h; y++){
3245 for(x=0; x<16; x++){
3246 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3248 if(y+1<h){
3249 for(x=0; x<15; x++){
3250 score2+= FFABS( s1[x ] - s1[x +stride]
3251 - s1[x+1] + s1[x+1+stride])
3252 -FFABS( s2[x ] - s2[x +stride]
3253 - s2[x+1] + s2[x+1+stride]);
3256 s1+= stride;
3257 s2+= stride;
3260 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3261 else return score1 + FFABS(score2)*8;
3264 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3265 MpegEncContext *c = v;
3266 int score1=0;
3267 int score2=0;
3268 int x,y;
3270 for(y=0; y<h; y++){
3271 for(x=0; x<8; x++){
3272 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3274 if(y+1<h){
3275 for(x=0; x<7; x++){
3276 score2+= FFABS( s1[x ] - s1[x +stride]
3277 - s1[x+1] + s1[x+1+stride])
3278 -FFABS( s2[x ] - s2[x +stride]
3279 - s2[x+1] + s2[x+1+stride]);
3282 s1+= stride;
3283 s2+= stride;
3286 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3287 else return score1 + FFABS(score2)*8;
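/* nsse16_c/nsse8_c implement a "noise preserving" comparison: plain SSE
 * (score1) plus the absolute difference of the two blocks' local 2x2
 * gradients (score2), weighted by avctx->nsse_weight, or by 8 when no
 * context is available. A width-generic sketch of the same idea, assuming
 * the exact semantics of the functions above:
 */
#if 0
static int nsse_sketch(uint8_t *s1, uint8_t *s2, int stride, int w, int h, int weight)
{
    int score1 = 0, score2 = 0, x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++)
            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
        if (y + 1 < h)
            for (x = 0; x + 1 < w; x++)
                score2 += FFABS(s1[x] - s1[x+stride] - s1[x+1] + s1[x+1+stride])
                        - FFABS(s2[x] - s2[x+stride] - s2[x+1] + s2[x+1+stride]);
        s1 += stride;
        s2 += stride;
    }
    return score1 + FFABS(score2) * weight;
}
#endif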
3290 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3291 int i;
3292 unsigned int sum=0;
3294 for(i=0; i<8*8; i++){
3295 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3296 int w= weight[i];
3297 b>>= RECON_SHIFT;
3298 assert(-512<b && b<512);
3300 sum += (w*b)*(w*b)>>4;
3302 return sum>>2;
3305 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3306 int i;
3308 for(i=0; i<8*8; i++){
3309 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
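/* try_8x8basis_c/add_8x8basis_c work in fixed point: basis[] carries
 * BASIS_SHIFT fraction bits and rem[] RECON_SHIFT, so the shared
 * (basis[i]*scale + rounding) >> (BASIS_SHIFT - RECON_SHIFT) term scales
 * the basis vector and rounds it into rem's precision. try_ evaluates the
 * weighted squared error without committing, which permits a hypothetical
 * test-then-commit use like this sketch (rem/weight/basis/scale as above):
 */
#if 0
if (try_8x8basis_c(rem, weight, basis, scale) <
    try_8x8basis_c(rem, weight, basis, 0))
    add_8x8basis_c(rem, basis, scale);   /* keep the change only if it helps */
#endif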
3313 /**
3314 * permutes an 8x8 block.
3315 * @param block the block which will be permuted according to the given permutation vector
3316 * @param permutation the permutation vector
3317 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3318 * @param scantable the scantable used; it only speeds the permutation up, the block is not
3319 * (inverse) permuted to scantable order!
3320 */
3321 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3323 int i;
3324 DCTELEM temp[64];
3326 if(last<=0) return;
3327 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3329 for(i=0; i<=last; i++){
3330 const int j= scantable[i];
3331 temp[j]= block[j];
3332 block[j]=0;
3335 for(i=0; i<=last; i++){
3336 const int j= scantable[i];
3337 const int perm_j= permutation[j];
3338 block[perm_j]= temp[j];
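/* The two passes above let the permutation run in place: the first pass
 * saves every touched coefficient into temp[] and clears its slot, so the
 * scatter through permutation[] neither reads an already-overwritten value
 * nor leaves a stale coefficient behind. A single-pass sketch of the
 * hazard (hypothetical helper, not part of this file):
 */
#if 0
static void block_permute_naive(DCTELEM *block, uint8_t *permutation)
{
    int i;
    for (i = 0; i < 64; i++)
        block[permutation[i]] = block[i];  /* WRONG: may read a slot that
                                              was already overwritten */
}
#endif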
3342 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3343 return 0;
3346 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3347 int i;
3349 memset(cmp, 0, sizeof(void*)*5);
3351 for(i=0; i<5; i++){
3352 switch(type&0xFF){
3353 case FF_CMP_SAD:
3354 cmp[i]= c->sad[i];
3355 break;
3356 case FF_CMP_SATD:
3357 cmp[i]= c->hadamard8_diff[i];
3358 break;
3359 case FF_CMP_SSE:
3360 cmp[i]= c->sse[i];
3361 break;
3362 case FF_CMP_DCT:
3363 cmp[i]= c->dct_sad[i];
3364 break;
3365 case FF_CMP_DCT264:
3366 cmp[i]= c->dct264_sad[i];
3367 break;
3368 case FF_CMP_DCTMAX:
3369 cmp[i]= c->dct_max[i];
3370 break;
3371 case FF_CMP_PSNR:
3372 cmp[i]= c->quant_psnr[i];
3373 break;
3374 case FF_CMP_BIT:
3375 cmp[i]= c->bit[i];
3376 break;
3377 case FF_CMP_RD:
3378 cmp[i]= c->rd[i];
3379 break;
3380 case FF_CMP_VSAD:
3381 cmp[i]= c->vsad[i];
3382 break;
3383 case FF_CMP_VSSE:
3384 cmp[i]= c->vsse[i];
3385 break;
3386 case FF_CMP_ZERO:
3387 cmp[i]= zero_cmp;
3388 break;
3389 case FF_CMP_NSSE:
3390 cmp[i]= c->nsse[i];
3391 break;
3392 #ifdef CONFIG_SNOW_ENCODER
3393 case FF_CMP_W53:
3394 cmp[i]= c->w53[i];
3395 break;
3396 case FF_CMP_W97:
3397 cmp[i]= c->w97[i];
3398 break;
3399 #endif
3400 default:
3401 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
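/* type&0xFF above keeps only the comparison id, so flag bits in the high
 * byte (such as FF_CMP_CHROMA) pass through unharmed and are handled by
 * the callers. A minimal usage sketch, assuming s is an initialized
 * MpegEncContext:
 */
#if 0
me_cmp_func sad_cmp[5];
ff_set_cmp(&s->dsp, sad_cmp, FF_CMP_SAD);  /* sad_cmp[0]: 16x16, [1]: 8x8, ... */
#endif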
3406 /**
3407 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3408 */
3409 static void clear_blocks_c(DCTELEM *blocks)
3411 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3414 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3415 long i;
3416 for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ /* (long) cast: avoid unsigned wrap when w < sizeof(long) */
3417 long a = *(long*)(src+i);
3418 long b = *(long*)(dst+i);
3419 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3421 for(; i<w; i++)
3422 dst[i+0] += src[i+0];
3425 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3426 long i;
3427 for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ /* (long) cast: avoid unsigned wrap when w < sizeof(long) */
3428 long a = *(long*)(src1+i);
3429 long b = *(long*)(src2+i);
3430 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3432 for(; i<w; i++)
3433 dst[i] = src1[i]+src2[i];
3436 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3437 long i;
3438 #ifndef HAVE_FAST_UNALIGNED
3439 if((long)src2 & (sizeof(long)-1)){
3440 for(i=0; i+7<w; i+=8){
3441 dst[i+0] = src1[i+0]-src2[i+0];
3442 dst[i+1] = src1[i+1]-src2[i+1];
3443 dst[i+2] = src1[i+2]-src2[i+2];
3444 dst[i+3] = src1[i+3]-src2[i+3];
3445 dst[i+4] = src1[i+4]-src2[i+4];
3446 dst[i+5] = src1[i+5]-src2[i+5];
3447 dst[i+6] = src1[i+6]-src2[i+6];
3448 dst[i+7] = src1[i+7]-src2[i+7];
3450 }else
3451 #endif
3452 for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ /* (long) cast: avoid unsigned wrap when w < sizeof(long) */
3453 long a = *(long*)(src1+i);
3454 long b = *(long*)(src2+i);
3455 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3457 for(; i<w; i++)
3458 dst[i+0] = src1[i+0]-src2[i+0];
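/* add_bytes_c/add_bytes_l2_c and diff_bytes_c above use a SWAR trick to
 * process sizeof(long) pixels per iteration: pb_7f/pb_80 replicate
 * 0x7f/0x80 into every byte, (a&pb_7f)+(b&pb_7f) adds the low 7 bits of
 * each byte with no carry crossing byte lanes, and ^((a^b)&pb_80) repairs
 * each byte's top bit (the subtraction form isolates the borrow the same
 * way). A one-byte self-check of the addition, as a sketch:
 */
#if 0
static int swar_byte_add_check(void)
{
    unsigned a = 0x9A, b = 0x77;
    unsigned swar = ((a & 0x7f) + (b & 0x7f)) ^ ((a ^ b) & 0x80);
    return swar == ((a + b) & 0xFF);    /* both are 0x11; returns 1 */
}
#endif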
3461 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3462 int i;
3463 uint8_t l, lt;
3465 l= *left;
3466 lt= *left_top;
3468 for(i=0; i<w; i++){
3469 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3470 lt= src1[i];
3471 l= src2[i];
3472 dst[i]= l - pred;
3475 *left= l;
3476 *left_top= lt;
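/* sub_hfyu_median_prediction_c writes the residual of the HuffYUV median
 * predictor: pred = median(left, top, left + top - topleft), which
 * mid_pred() computes without a full sort. A branchy reference sketch of
 * the same median-of-three (hypothetical name):
 */
#if 0
static inline int median3_sketch(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; }   /* now a <= b */
    if (b > c)  b = c;                        /* b = min(max of pair, c) */
    return FFMAX(a, b);                       /* median of the three */
}
#endif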
3479 #define BUTTERFLY2(o1,o2,i1,i2) \
3480 o1= (i1)+(i2);\
3481 o2= (i1)-(i2);
3483 #define BUTTERFLY1(x,y) \
3484 {\
3485 int a,b;\
3486 a= x;\
3487 b= y;\
3488 x= a+b;\
3489 y= a-b;\
3490 }
3492 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3494 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3495 int i;
3496 int temp[64];
3497 int sum=0;
3499 assert(h==8);
3501 for(i=0; i<8; i++){
3502 //FIXME try pointer walks
3503 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3504 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3505 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3506 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3508 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3509 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3510 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3511 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3513 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3514 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3515 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3516 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3519 for(i=0; i<8; i++){
3520 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3521 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3522 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3523 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3525 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3526 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3527 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3528 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3530 sum +=
3531 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3532 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3533 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3534 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3543 return sum;
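/* hadamard8_diff8x8_c is the classic SATD: the residual src-dst is run
 * through an 8x8 Hadamard transform (three butterfly stages per row, then
 * per column, with BUTTERFLYA folding the last column stage into the
 * absolute-value sum). A 1-D length-8 Walsh-Hadamard sketch of the same
 * butterfly structure (illustrative helper, not used by this file):
 */
#if 0
static void wht8_sketch(int v[8])
{
    int stage, i, j;
    for (stage = 1; stage < 8; stage <<= 1)        /* stages 1, 2, 4 */
        for (i = 0; i < 8; i += stage << 1)
            for (j = i; j < i + stage; j++) {
                int a = v[j], b = v[j + stage];
                v[j]         = a + b;
                v[j + stage] = a - b;
            }
}
#endif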
3546 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3547 int i;
3548 int temp[64];
3549 int sum=0;
3551 assert(h==8);
3553 for(i=0; i<8; i++){
3554 //FIXME try pointer walks
3555 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3556 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3557 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3558 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3560 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3561 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3562 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3563 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3565 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3566 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3567 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3568 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3571 for(i=0; i<8; i++){
3572 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3573 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3574 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3575 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3577 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3578 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3579 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3580 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3582 sum +=
3583 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3584 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3585 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3586 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3589 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3591 return sum;
3594 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3595 MpegEncContext * const s= (MpegEncContext *)c;
3596 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3597 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3599 assert(h==8);
3601 s->dsp.diff_pixels(temp, src1, src2, stride);
3602 s->dsp.fdct(temp);
3603 return s->dsp.sum_abs_dctelem(temp);
3606 #ifdef CONFIG_GPL
3607 #define DCT8_1D {\
3608 const int s07 = SRC(0) + SRC(7);\
3609 const int s16 = SRC(1) + SRC(6);\
3610 const int s25 = SRC(2) + SRC(5);\
3611 const int s34 = SRC(3) + SRC(4);\
3612 const int a0 = s07 + s34;\
3613 const int a1 = s16 + s25;\
3614 const int a2 = s07 - s34;\
3615 const int a3 = s16 - s25;\
3616 const int d07 = SRC(0) - SRC(7);\
3617 const int d16 = SRC(1) - SRC(6);\
3618 const int d25 = SRC(2) - SRC(5);\
3619 const int d34 = SRC(3) - SRC(4);\
3620 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3621 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3622 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3623 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3624 DST(0, a0 + a1 ) ;\
3625 DST(1, a4 + (a7>>2)) ;\
3626 DST(2, a2 + (a3>>1)) ;\
3627 DST(3, a5 + (a6>>2)) ;\
3628 DST(4, a0 - a1 ) ;\
3629 DST(5, a6 - (a5>>2)) ;\
3630 DST(6, (a2>>1) - a3 ) ;\
3631 DST(7, (a4>>2) - a7 ) ;\
3632 }
3634 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3635 MpegEncContext * const s= (MpegEncContext *)c;
3636 DCTELEM dct[8][8];
3637 int i;
3638 int sum=0;
3640 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3642 #define SRC(x) dct[i][x]
3643 #define DST(x,v) dct[i][x]= v
3644 for( i = 0; i < 8; i++ )
3645 DCT8_1D
3646 #undef SRC
3647 #undef DST
3649 #define SRC(x) dct[x][i]
3650 #define DST(x,v) sum += FFABS(v)
3651 for( i = 0; i < 8; i++ )
3652 DCT8_1D
3653 #undef SRC
3654 #undef DST
3655 return sum;
3657 #endif
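/* DCT8_1D above is the 8-point kernel of the H.264/AVC 8x8 integer
 * transform approximation (hence "dct264_sad"). Redefining SRC/DST around
 * the two loops retargets one macro: the first pass transforms rows in
 * place, the second reads columns and accumulates |coefficient| directly
 * instead of storing it. A self-contained 2x2 miniature of that
 * row-then-column pattern:
 */
#if 0
static int separable_cost_sketch(int blk[2][2])
{
    int i, t, sum = 0;
    for (i = 0; i < 2; i++) {                  /* row pass, in place */
        t = blk[i][0];
        blk[i][0] = t + blk[i][1];
        blk[i][1] = t - blk[i][1];
    }
    for (i = 0; i < 2; i++) {                  /* column pass feeds the cost */
        sum += FFABS(blk[0][i] + blk[1][i]);
        sum += FFABS(blk[0][i] - blk[1][i]);
    }
    return sum;
}
#endif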
3659 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3660 MpegEncContext * const s= (MpegEncContext *)c;
3661 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3662 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3663 int sum=0, i;
3665 assert(h==8);
3667 s->dsp.diff_pixels(temp, src1, src2, stride);
3668 s->dsp.fdct(temp);
3670 for(i=0; i<64; i++)
3671 sum= FFMAX(sum, FFABS(temp[i]));
3673 return sum;
3676 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3677 MpegEncContext * const s= (MpegEncContext *)c;
3678 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3679 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3680 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3681 int sum=0, i;
3683 assert(h==8);
3684 s->mb_intra=0;
3686 s->dsp.diff_pixels(temp, src1, src2, stride);
3688 memcpy(bak, temp, 64*sizeof(DCTELEM));
3690 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3691 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3692 ff_simple_idct(temp); //FIXME
3694 for(i=0; i<64; i++)
3695 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3697 return sum;
3700 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3701 MpegEncContext * const s= (MpegEncContext *)c;
3702 const uint8_t *scantable= s->intra_scantable.permutated;
3703 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3704 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3705 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3706 uint8_t * const bak= (uint8_t*)aligned_bak;
3707 int i, last, run, bits, level, distortion, start_i;
3708 const int esc_length= s->ac_esc_length;
3709 uint8_t * length;
3710 uint8_t * last_length;
3712 assert(h==8);
3714 for(i=0; i<8; i++){
3715 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3716 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3719 s->dsp.diff_pixels(temp, src1, src2, stride);
3721 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3723 bits=0;
3725 if (s->mb_intra) {
3726 start_i = 1;
3727 length = s->intra_ac_vlc_length;
3728 last_length= s->intra_ac_vlc_last_length;
3729 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3730 } else {
3731 start_i = 0;
3732 length = s->inter_ac_vlc_length;
3733 last_length= s->inter_ac_vlc_last_length;
3736 if(last>=start_i){
3737 run=0;
3738 for(i=start_i; i<last; i++){
3739 int j= scantable[i];
3740 level= temp[j];
3742 if(level){
3743 level+=64;
3744 if((level&(~127)) == 0){
3745 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3746 }else
3747 bits+= esc_length;
3748 run=0;
3749 }else
3750 run++;
3752 i= scantable[last];
3754 level= temp[i] + 64;
3756 assert(level - 64);
3758 if((level&(~127)) == 0){
3759 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3760 }else
3761 bits+= esc_length;
3765 if(last>=0){
3766 if(s->mb_intra)
3767 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3768 else
3769 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3772 s->dsp.idct_add(bak, stride, temp);
3774 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3776 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
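/* The return value above is a rate-distortion cost D + lambda*R with the
 * lambda folded into integer arithmetic: (bits*qscale*qscale*109 + 64)>>7
 * weights the bit count by roughly 0.85*qscale^2 (109/128 ~= 0.85), with
 * +64 rounding the shift. The same weighting in floating point, as a
 * sketch (distortion/qscale/bits as in the function above):
 */
#if 0
double rd_cost = distortion + 0.85 * qscale * qscale * bits;
#endif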
3779 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3780 MpegEncContext * const s= (MpegEncContext *)c;
3781 const uint8_t *scantable= s->intra_scantable.permutated;
3782 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3783 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3784 int i, last, run, bits, level, start_i;
3785 const int esc_length= s->ac_esc_length;
3786 uint8_t * length;
3787 uint8_t * last_length;
3789 assert(h==8);
3791 s->dsp.diff_pixels(temp, src1, src2, stride);
3793 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3795 bits=0;
3797 if (s->mb_intra) {
3798 start_i = 1;
3799 length = s->intra_ac_vlc_length;
3800 last_length= s->intra_ac_vlc_last_length;
3801 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3802 } else {
3803 start_i = 0;
3804 length = s->inter_ac_vlc_length;
3805 last_length= s->inter_ac_vlc_last_length;
3808 if(last>=start_i){
3809 run=0;
3810 for(i=start_i; i<last; i++){
3811 int j= scantable[i];
3812 level= temp[j];
3814 if(level){
3815 level+=64;
3816 if((level&(~127)) == 0){
3817 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3818 }else
3819 bits+= esc_length;
3820 run=0;
3821 }else
3822 run++;
3824 i= scantable[last];
3826 level= temp[i] + 64;
3828 assert(level - 64);
3830 if((level&(~127)) == 0){
3831 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3832 }else
3833 bits+= esc_length;
3836 return bits;
3839 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3840 int score=0;
3841 int x,y;
3843 for(y=1; y<h; y++){
3844 for(x=0; x<16; x+=4){
3845 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3846 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
3848 s+= stride;
3851 return score;
3854 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3855 int score=0;
3856 int x,y;
3858 for(y=1; y<h; y++){
3859 for(x=0; x<16; x++){
3860 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3862 s1+= stride;
3863 s2+= stride;
3866 return score;
3869 #define SQ(a) ((a)*(a))
3870 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3871 int score=0;
3872 int x,y;
3874 for(y=1; y<h; y++){
3875 for(x=0; x<16; x+=4){
3876 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3877 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3879 s+= stride;
3882 return score;
3885 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3886 int score=0;
3887 int x,y;
3889 for(y=1; y<h; y++){
3890 for(x=0; x<16; x++){
3891 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3893 s1+= stride;
3894 s2+= stride;
3897 return score;
3900 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3901 int size){
3902 int score=0;
3903 int i;
3904 for(i=0; i<size; i++)
3905 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3906 return score;
3909 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3910 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3911 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3912 #ifdef CONFIG_GPL
3913 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3914 #endif
3915 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3916 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3917 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3918 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
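/* WRAPPER8_16_SQ (defined earlier in this file) builds each 16-wide
 * comparison above out of its 8x8 kernel, presumably by summing the kernel
 * over the 8x8 quadrants. A sketch of that expansion for one pair, under
 * that assumption:
 */
#if 0
static int dct_sad16_sketch(void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int score = dct_sad8x8_c(s, dst,     src,     stride, 8)
              + dct_sad8x8_c(s, dst + 8, src + 8, stride, 8);
    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += dct_sad8x8_c(s, dst,     src,     stride, 8)
               + dct_sad8x8_c(s, dst + 8, src + 8, stride, 8);
    }
    return score;
}
#endif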
3920 static void vector_fmul_c(float *dst, const float *src, int len){
3921 int i;
3922 for(i=0; i<len; i++)
3923 dst[i] *= src[i];
3926 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3927 int i;
3928 src1 += len-1;
3929 for(i=0; i<len; i++)
3930 dst[i] = src0[i] * src1[-i];
3933 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3934 int i;
3935 for(i=0; i<len; i++)
3936 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
3939 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3940 int i,j;
3941 dst += len;
3942 win += len;
3943 src0+= len;
3944 for(i=-len, j=len-1; i<0; i++, j--) {
3945 float s0 = src0[i];
3946 float s1 = src1[j];
3947 float wi = win[i];
3948 float wj = win[j];
3949 dst[i] = s0*wj - s1*wi + add_bias;
3950 dst[j] = s0*wi + s1*wj + add_bias;
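/* ff_vector_fmul_window_c computes the windowed overlap between two blocks
 * as used by MDCT-based audio decoders: i walks the output from the left,
 * j mirrors it from the right, and each (s0, s1) pair is rotated by the
 * window values wi/wj. One mirrored pair in isolation, as a sketch:
 */
#if 0
static void fmul_window_pair_sketch(float *lo, float *hi, float s0, float s1,
                                    float wi, float wj, float add_bias)
{
    *lo = s0 * wj - s1 * wi + add_bias;   /* dst[i] in the loop above */
    *hi = s0 * wi + s1 * wj + add_bias;   /* dst[j] in the loop above */
}
#endif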
3954 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3955 int i;
3956 for(i=0; i<len; i++)
3957 dst[i] = src[i] * mul;
3960 static av_always_inline int float_to_int16_one(const float *src){
3961 int_fast32_t tmp = *(const int32_t*)src;
3962 if(tmp & 0xf0000){
3963 tmp = (0x43c0ffff - tmp)>>31;
3964 // is this faster on some gcc/cpu combinations?
3965 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3966 // else tmp = 0;
3968 return tmp - 0x8000;
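/* float_to_int16_one reinterprets the float's IEEE-754 bits directly: the
 * conversion assumes callers have pre-scaled/biased the samples into a
 * narrow exponent range in which the low bits of the representation are
 * already the desired fixed-point value; the (0x43c0ffff - tmp)>>31 step
 * clamps anything outside that range to 0 or 0xFFFF before the -0x8000
 * recentering. A portable (slower) equivalent, assuming instead plain
 * samples in [-32768.0, 32767.0]:
 */
#if 0
#include <math.h>
static inline int16_t float_to_int16_portable(float x)
{
    long v = lrintf(x);
    if (v >  32767) v =  32767;
    if (v < -32768) v = -32768;
    return (int16_t)v;
}
#endif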
3971 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3972 int i;
3973 for(i=0; i<len; i++)
3974 dst[i] = float_to_int16_one(src+i);
3977 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3978 int i,j,c;
3979 if(channels==2){
3980 for(i=0; i<len; i++){
3981 dst[2*i] = float_to_int16_one(src[0]+i);
3982 dst[2*i+1] = float_to_int16_one(src[1]+i);
3984 }else{
3985 for(c=0; c<channels; c++)
3986 for(i=0, j=c; i<len; i++, j+=channels)
3987 dst[j] = float_to_int16_one(src[c]+i);
3991 static void add_int16_c(int16_t * v1, int16_t * v2, int order)
3993 while (order--)
3994 *v1++ += *v2++;
3997 static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
3999 while (order--)
4000 *v1++ -= *v2++;
4003 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4005 int res = 0;
4007 while (order--)
4008 res += (*v1++ * *v2++) >> shift;
4010 return res;
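/* Note that scalarproduct_int16_c shifts every product before summing, so
 * it is not equivalent to shifting the final sum once: each term drops its
 * low bits independently. A full-precision variant with a wider
 * accumulator, as a sketch:
 */
#if 0
static int64_t scalarproduct_int16_wide_sketch(const int16_t *v1, const int16_t *v2,
                                               int order, int shift)
{
    int64_t res = 0;
    while (order--)
        res += *v1++ * *v2++;
    return res >> shift;
}
#endif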
4013 #define W0 2048
4014 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4015 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4016 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4017 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4018 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4019 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4020 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
4022 static void wmv2_idct_row(short * b)
4024 int s1,s2;
4025 int a0,a1,a2,a3,a4,a5,a6,a7;
4026 /*step 1*/
4027 a1 = W1*b[1]+W7*b[7];
4028 a7 = W7*b[1]-W1*b[7];
4029 a5 = W5*b[5]+W3*b[3];
4030 a3 = W3*b[5]-W5*b[3];
4031 a2 = W2*b[2]+W6*b[6];
4032 a6 = W6*b[2]-W2*b[6];
4033 a0 = W0*b[0]+W0*b[4];
4034 a4 = W0*b[0]-W0*b[4];
4035 /*step 2*/
4036 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4037 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4038 /*step 3*/
4039 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4040 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4041 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4042 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4043 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4044 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4045 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4046 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4048 static void wmv2_idct_col(short * b)
4050 int s1,s2;
4051 int a0,a1,a2,a3,a4,a5,a6,a7;
4052 /*step 1, with extended precision*/
4053 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4054 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4055 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4056 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4057 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4058 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4059 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4060 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4061 /*step 2*/
4062 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4063 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4064 /*step 3*/
4065 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4066 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4067 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4068 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4070 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4071 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4072 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4073 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4075 void ff_wmv2_idct_c(short * block){
4076 int i;
4078 for(i=0;i<64;i+=8){
4079 wmv2_idct_row(block+i);
4081 for(i=0;i<8;i++){
4082 wmv2_idct_col(block+i);
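/* wmv2_idct_row/wmv2_idct_col form a separable fixed-point IDCT: W1..W7
 * are cos(k*pi/16) scaled by 2048*sqrt(2), and every (x + (1<<(b-1))) >> b
 * removes b fraction bits with round-to-nearest. The column pass keeps
 * three extra bits (the early >>3 "extended precision") before the final
 * >>14. The rounding idiom in isolation, as a sketch:
 */
#if 0
static inline int round_shift_sketch(int x, int b)
{
    return (x + (1 << (b - 1))) >> b;   /* round to nearest, ties rounded up */
}
#endif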
4085 /* XXX: these functions should be removed as soon as all IDCTs are
4086    converted */
4087 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4089 ff_wmv2_idct_c(block);
4090 put_pixels_clamped_c(block, dest, line_size);
4092 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4094 ff_wmv2_idct_c(block);
4095 add_pixels_clamped_c(block, dest, line_size);
4097 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4099 j_rev_dct (block);
4100 put_pixels_clamped_c(block, dest, line_size);
4102 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4104 j_rev_dct (block);
4105 add_pixels_clamped_c(block, dest, line_size);
4108 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4110 j_rev_dct4 (block);
4111 put_pixels_clamped4_c(block, dest, line_size);
4113 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4115 j_rev_dct4 (block);
4116 add_pixels_clamped4_c(block, dest, line_size);
4119 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4121 j_rev_dct2 (block);
4122 put_pixels_clamped2_c(block, dest, line_size);
4124 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4126 j_rev_dct2 (block);
4127 add_pixels_clamped2_c(block, dest, line_size);
4130 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4132 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4134 dest[0] = cm[(block[0] + 4)>>3];
4136 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4138 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4140 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4143 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4145 /* init static data */
4146 void dsputil_static_init(void)
4148 int i;
4150 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4151 for(i=0;i<MAX_NEG_CROP;i++) {
4152 ff_cropTbl[i] = 0;
4153 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4156 for(i=0;i<512;i++) {
4157 ff_squareTbl[i] = (i - 256) * (i - 256);
4160 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
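/* ff_cropTbl is laid out as MAX_NEG_CROP zeros, the identity 0..255, then
 * MAX_NEG_CROP copies of 255, so indexing it through a recentered pointer
 * clamps to [0,255] without branches. Usage sketch, with v a hypothetical
 * intermediate that may under- or overshoot:
 */
#if 0
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
uint8_t pixel = cm[v];   /* behaves like av_clip_uint8(v) within table range */
#endif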
4163 int ff_check_alignment(void){
4164 static int did_fail=0;
4165 DECLARE_ALIGNED_16(int, aligned);
4167 if((long)&aligned & 15){
4168 if(!did_fail){
4169 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4170 av_log(NULL, AV_LOG_ERROR,
4171 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4172 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4173 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4174 "Do not report crashes to FFmpeg developers.\n");
4175 #endif
4176 did_fail=1;
4178 return -1;
4180 return 0;
4183 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4185 int i;
4187 ff_check_alignment();
4189 #ifdef CONFIG_ENCODERS
4190 if(avctx->dct_algo==FF_DCT_FASTINT) {
4191 c->fdct = fdct_ifast;
4192 c->fdct248 = fdct_ifast248;
4194 else if(avctx->dct_algo==FF_DCT_FAAN) {
4195 c->fdct = ff_faandct;
4196 c->fdct248 = ff_faandct248;
4198 else {
4199 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4200 c->fdct248 = ff_fdct248_islow;
4202 #endif //CONFIG_ENCODERS
4204 if(avctx->lowres==1){
4205 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4206 c->idct_put= ff_jref_idct4_put;
4207 c->idct_add= ff_jref_idct4_add;
4208 }else{
4209 c->idct_put= ff_h264_lowres_idct_put_c;
4210 c->idct_add= ff_h264_lowres_idct_add_c;
4212 c->idct = j_rev_dct4;
4213 c->idct_permutation_type= FF_NO_IDCT_PERM;
4214 }else if(avctx->lowres==2){
4215 c->idct_put= ff_jref_idct2_put;
4216 c->idct_add= ff_jref_idct2_add;
4217 c->idct = j_rev_dct2;
4218 c->idct_permutation_type= FF_NO_IDCT_PERM;
4219 }else if(avctx->lowres==3){
4220 c->idct_put= ff_jref_idct1_put;
4221 c->idct_add= ff_jref_idct1_add;
4222 c->idct = j_rev_dct1;
4223 c->idct_permutation_type= FF_NO_IDCT_PERM;
4224 }else{
4225 if(avctx->idct_algo==FF_IDCT_INT){
4226 c->idct_put= ff_jref_idct_put;
4227 c->idct_add= ff_jref_idct_add;
4228 c->idct = j_rev_dct;
4229 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4230 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4231 avctx->idct_algo==FF_IDCT_VP3){
4232 c->idct_put= ff_vp3_idct_put_c;
4233 c->idct_add= ff_vp3_idct_add_c;
4234 c->idct = ff_vp3_idct_c;
4235 c->idct_permutation_type= FF_NO_IDCT_PERM;
4236 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4237 c->idct_put= ff_wmv2_idct_put_c;
4238 c->idct_add= ff_wmv2_idct_add_c;
4239 c->idct = ff_wmv2_idct_c;
4240 c->idct_permutation_type= FF_NO_IDCT_PERM;
4241 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4242 c->idct_put= ff_faanidct_put;
4243 c->idct_add= ff_faanidct_add;
4244 c->idct = ff_faanidct;
4245 c->idct_permutation_type= FF_NO_IDCT_PERM;
4246 }else if(ENABLE_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4247 c->idct_put= ff_ea_idct_put_c;
4248 c->idct_permutation_type= FF_NO_IDCT_PERM;
4249 }else{ //accurate/default
4250 c->idct_put= ff_simple_idct_put;
4251 c->idct_add= ff_simple_idct_add;
4252 c->idct = ff_simple_idct;
4253 c->idct_permutation_type= FF_NO_IDCT_PERM;
4257 if (ENABLE_H264_DECODER) {
4258 c->h264_idct_add= ff_h264_idct_add_c;
4259 c->h264_idct8_add= ff_h264_idct8_add_c;
4260 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4261 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4264 c->get_pixels = get_pixels_c;
4265 c->diff_pixels = diff_pixels_c;
4266 c->put_pixels_clamped = put_pixels_clamped_c;
4267 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4268 c->add_pixels_clamped = add_pixels_clamped_c;
4269 c->add_pixels8 = add_pixels8_c;
4270 c->add_pixels4 = add_pixels4_c;
4271 c->sum_abs_dctelem = sum_abs_dctelem_c;
4272 c->gmc1 = gmc1_c;
4273 c->gmc = ff_gmc_c;
4274 c->clear_blocks = clear_blocks_c;
4275 c->pix_sum = pix_sum_c;
4276 c->pix_norm1 = pix_norm1_c;
4278 /* pix_abs[0] is for 16x16 blocks, pix_abs[1] for 8x8 blocks */
4279 c->pix_abs[0][0] = pix_abs16_c;
4280 c->pix_abs[0][1] = pix_abs16_x2_c;
4281 c->pix_abs[0][2] = pix_abs16_y2_c;
4282 c->pix_abs[0][3] = pix_abs16_xy2_c;
4283 c->pix_abs[1][0] = pix_abs8_c;
4284 c->pix_abs[1][1] = pix_abs8_x2_c;
4285 c->pix_abs[1][2] = pix_abs8_y2_c;
4286 c->pix_abs[1][3] = pix_abs8_xy2_c;
4288 #define dspfunc(PFX, IDX, NUM) \
4289 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4290 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4291 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4292 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4294 dspfunc(put, 0, 16);
4295 dspfunc(put_no_rnd, 0, 16);
4296 dspfunc(put, 1, 8);
4297 dspfunc(put_no_rnd, 1, 8);
4298 dspfunc(put, 2, 4);
4299 dspfunc(put, 3, 2);
4301 dspfunc(avg, 0, 16);
4302 dspfunc(avg_no_rnd, 0, 16);
4303 dspfunc(avg, 1, 8);
4304 dspfunc(avg_no_rnd, 1, 8);
4305 dspfunc(avg, 2, 4);
4306 dspfunc(avg, 3, 2);
4307 #undef dspfunc
4309 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4310 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4312 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4313 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4314 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4315 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4316 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4317 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4318 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4319 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4320 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4322 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4323 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4324 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4325 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4326 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4327 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4328 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4329 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4330 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4332 #define dspfunc(PFX, IDX, NUM) \
4333 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4334 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4335 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4336 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4337 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4338 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4339 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4340 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4341 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4342 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4343 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4344 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4345 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4346 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4347 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4348 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4350 dspfunc(put_qpel, 0, 16);
4351 dspfunc(put_no_rnd_qpel, 0, 16);
4353 dspfunc(avg_qpel, 0, 16);
4354 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4356 dspfunc(put_qpel, 1, 8);
4357 dspfunc(put_no_rnd_qpel, 1, 8);
4359 dspfunc(avg_qpel, 1, 8);
4360 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4362 dspfunc(put_h264_qpel, 0, 16);
4363 dspfunc(put_h264_qpel, 1, 8);
4364 dspfunc(put_h264_qpel, 2, 4);
4365 dspfunc(put_h264_qpel, 3, 2);
4366 dspfunc(avg_h264_qpel, 0, 16);
4367 dspfunc(avg_h264_qpel, 1, 8);
4368 dspfunc(avg_h264_qpel, 2, 4);
4370 #undef dspfunc
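/* In the tables built above, the second index encodes the sub-pel phase of
 * the motion vector: the 4-entry half-pel tables (first dspfunc block
 * further up) use (dy<<1)|dx of the half-pel bits, while the 16-entry mcXY
 * qpel/h264 tables built here use 4*qy + qx of the quarter-pel bits. A
 * dispatch sketch, assuming motion vector components mx/my and a
 * filled-in DSPContext c:
 */
#if 0
int dxy_half = ((my & 1) << 1) | (mx & 1);      /* mv in half-pel units    */
int dxy_qpel = ((my & 3) << 2) | (mx & 3);      /* mv in quarter-pel units */
c->put_pixels_tab[0][dxy_half](dst, src, stride, 16);
c->put_qpel_pixels_tab[0][dxy_qpel](dst, src, stride);
#endif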
4371 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4372 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4373 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4374 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4375 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4376 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4377 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4379 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4380 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4381 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4382 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4383 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4384 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4385 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4386 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4387 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4388 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4389 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4390 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4391 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4392 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4393 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4394 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4395 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4396 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4397 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4398 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4400 c->draw_edges = draw_edges_c;
4402 #ifdef CONFIG_CAVS_DECODER
4403 ff_cavsdsp_init(c,avctx);
4404 #endif
4405 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4406 ff_vc1dsp_init(c,avctx);
4407 #endif
4408 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4409 ff_intrax8dsp_init(c,avctx);
4410 #endif
4411 #if defined(CONFIG_H264_ENCODER)
4412 ff_h264dspenc_init(c,avctx);
4413 #endif
4415 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4416 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4417 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4418 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4419 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4420 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4421 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4422 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4424 #define SET_CMP_FUNC(name) \
4425 c->name[0]= name ## 16_c;\
4426 c->name[1]= name ## 8x8_c;
4428 SET_CMP_FUNC(hadamard8_diff)
4429 c->hadamard8_diff[4]= hadamard8_intra16_c;
4430 SET_CMP_FUNC(dct_sad)
4431 SET_CMP_FUNC(dct_max)
4432 #ifdef CONFIG_GPL
4433 SET_CMP_FUNC(dct264_sad)
4434 #endif
4435 c->sad[0]= pix_abs16_c;
4436 c->sad[1]= pix_abs8_c;
4437 c->sse[0]= sse16_c;
4438 c->sse[1]= sse8_c;
4439 c->sse[2]= sse4_c;
4440 SET_CMP_FUNC(quant_psnr)
4441 SET_CMP_FUNC(rd)
4442 SET_CMP_FUNC(bit)
4443 c->vsad[0]= vsad16_c;
4444 c->vsad[4]= vsad_intra16_c;
4445 c->vsse[0]= vsse16_c;
4446 c->vsse[4]= vsse_intra16_c;
4447 c->nsse[0]= nsse16_c;
4448 c->nsse[1]= nsse8_c;
4449 #ifdef CONFIG_SNOW_ENCODER
4450 c->w53[0]= w53_16_c;
4451 c->w53[1]= w53_8_c;
4452 c->w97[0]= w97_16_c;
4453 c->w97[1]= w97_8_c;
4454 #endif
4456 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4458 c->add_bytes= add_bytes_c;
4459 c->add_bytes_l2= add_bytes_l2_c;
4460 c->diff_bytes= diff_bytes_c;
4461 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4462 c->bswap_buf= bswap_buf;
4463 #ifdef CONFIG_PNG_DECODER
4464 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4465 #endif
4467 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4468 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4469 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4470 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4471 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4472 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4473 c->h264_loop_filter_strength= NULL;
4475 if (ENABLE_ANY_H263) {
4476 c->h263_h_loop_filter= h263_h_loop_filter_c;
4477 c->h263_v_loop_filter= h263_v_loop_filter_c;
4480 if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
4481 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4482 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4485 c->h261_loop_filter= h261_loop_filter_c;
4487 c->try_8x8basis= try_8x8basis_c;
4488 c->add_8x8basis= add_8x8basis_c;
4490 #ifdef CONFIG_SNOW_DECODER
4491 c->vertical_compose97i = ff_snow_vertical_compose97i;
4492 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4493 c->inner_add_yblock = ff_snow_inner_add_yblock;
4494 #endif
4496 #ifdef CONFIG_VORBIS_DECODER
4497 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4498 #endif
4499 #ifdef CONFIG_AC3_DECODER
4500 c->ac3_downmix = ff_ac3_downmix_c;
4501 #endif
4502 #ifdef CONFIG_FLAC_ENCODER
4503 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4504 #endif
4505 c->vector_fmul = vector_fmul_c;
4506 c->vector_fmul_reverse = vector_fmul_reverse_c;
4507 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4508 c->vector_fmul_window = ff_vector_fmul_window_c;
4509 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4510 c->float_to_int16 = ff_float_to_int16_c;
4511 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4512 c->add_int16 = add_int16_c;
4513 c->sub_int16 = sub_int16_c;
4514 c->scalarproduct_int16 = scalarproduct_int16_c;
4516 c->shrink[0]= ff_img_copy_plane;
4517 c->shrink[1]= ff_shrink22;
4518 c->shrink[2]= ff_shrink44;
4519 c->shrink[3]= ff_shrink88;
4521 c->prefetch= just_return;
4523 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4524 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4526 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4527 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4528 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4529 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4530 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4531 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4532 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4533 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4534 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4536 for(i=0; i<64; i++){
4537 if(!c->put_2tap_qpel_pixels_tab[0][i])
4538 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4539 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4540 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4543 switch(c->idct_permutation_type){
4544 case FF_NO_IDCT_PERM:
4545 for(i=0; i<64; i++)
4546 c->idct_permutation[i]= i;
4547 break;
4548 case FF_LIBMPEG2_IDCT_PERM:
4549 for(i=0; i<64; i++)
4550 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4551 break;
4552 case FF_SIMPLE_IDCT_PERM:
4553 for(i=0; i<64; i++)
4554 c->idct_permutation[i]= simple_mmx_permutation[i];
4555 break;
4556 case FF_TRANSPOSE_IDCT_PERM:
4557 for(i=0; i<64; i++)
4558 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4559 break;
4560 case FF_PARTTRANS_IDCT_PERM:
4561 for(i=0; i<64; i++)
4562 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4563 break;
4564 case FF_SSE2_IDCT_PERM:
4565 for(i=0; i<64; i++)
4566 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4567 break;
4568 default:
4569 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");