libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file libavcodec/dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "mathops.h"
  36 #include "h263.h"
  37 #include "snow.h"
  38
  39 /* snow.c */
  40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
  41
  42 /* vorbis.c */
  43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
  44
  45 /* ac3dec.c */
  46 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
  47
  48 /* flacenc.c */
  49 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
  50
  51 /* pngdec.c */
  52 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
  53
  54 /* eaidct.c */
  55 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
  56
  57 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  58 uint32_t ff_squareTbl[512] = {0, };
  59
  60 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  61 #define pb_7f (~0UL/255 * 0x7f)
  62 #define pb_80 (~0UL/255 * 0x80)
  63
  64 const uint8_t ff_zigzag_direct[64] = {
  65     0,   1,  8, 16,  9,  2,  3, 10,
  66     17, 24, 32, 25, 18, 11,  4,  5,
  67     12, 19, 26, 33, 40, 48, 41, 34,
  68     27, 20, 13,  6,  7, 14, 21, 28,
  69     35, 42, 49, 56, 57, 50, 43, 36,
  70     29, 22, 15, 23, 30, 37, 44, 51,
  71     58, 59, 52, 45, 38, 31, 39, 46,
  72     53, 60, 61, 54, 47, 55, 62, 63
  73 };
  74
  75 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  76    specification, we interleave the fields */
  77 const uint8_t ff_zigzag248_direct[64] = {
  78      0,  8,  1,  9, 16, 24,  2, 10,
  79     17, 25, 32, 40, 48, 56, 33, 41,
  80     18, 26,  3, 11,  4, 12, 19, 27,
  81     34, 42, 49, 57, 50, 58, 35, 43,
  82     20, 28,  5, 13,  6, 14, 21, 29,
  83     36, 44, 51, 59, 52, 60, 37, 45,
  84     22, 30,  7, 15, 23, 31, 38, 46,
  85     53, 61, 54, 62, 39, 47, 55, 63,
  86 };
  87
  88 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  89 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
  90
  91 const uint8_t ff_alternate_horizontal_scan[64] = {
  92     0,  1,   2,  3,  8,  9, 16, 17,
  93     10, 11,  4,  5,  6,  7, 15, 14,
  94     13, 12, 19, 18, 24, 25, 32, 33,
  95     26, 27, 20, 21, 22, 23, 28, 29,
  96     30, 31, 34, 35, 40, 41, 48, 49,
  97     42, 43, 36, 37, 38, 39, 44, 45,
  98     46, 47, 50, 51, 56, 57, 58, 59,
  99     52, 53, 54, 55, 60, 61, 62, 63,
 100 };
 101
 102 const uint8_t ff_alternate_vertical_scan[64] = {
 103     0,  8,  16, 24,  1,  9,  2, 10,
 104     17, 25, 32, 40, 48, 56, 57, 49,
 105     41, 33, 26, 18,  3, 11,  4, 12,
 106     19, 27, 34, 42, 50, 58, 35, 43,
 107     51, 59, 20, 28,  5, 13,  6, 14,
 108     21, 29, 36, 44, 52, 60, 37, 45,
 109     53, 61, 22, 30,  7, 15, 23, 31,
 110     38, 46, 54, 62, 39, 47, 55, 63,
 111 };
 112
 113 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
 114 const uint32_t ff_inverse[256]={
 115          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 116  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 117  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 118  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 119  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 120  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 121   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 122   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 123   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 124   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 125   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 126   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 127   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 128   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 129   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 130   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 131   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 132   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 133   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 134   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 135   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 136   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 137   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 138   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 139   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 140   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 141   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 142   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 143   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 144   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 145   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 146   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 147 };
 148
 149 /* Input permutation for the simple_idct_mmx */
 150 static const uint8_t simple_mmx_permutation[64]={
 151         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 152         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 153         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 154         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 155         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 156         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 157         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 158         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 159 };
 160
 161 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 162
 163 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 164     int i;
 165     int end;
 166
 167     st->scantable= src_scantable;
 168
 169     for(i=0; i<64; i++){
 170         int j;
 171         j = src_scantable[i];
 172         st->permutated[i] = permutation[j];
 173 #if ARCH_PPC
 174         st->inverse[j] = i;
 175 #endif
 176     }
 177
 178     end=-1;
 179     for(i=0; i<64; i++){
 180         int j;
 181         j = st->permutated[i];
 182         if(j>end) end=j;
 183         st->raster_end[i]= end;
 184     }
 185 }
 186
 187 static int pix_sum_c(uint8_t * pix, int line_size)
 188 {
 189     int s, i, j;
 190
 191     s = 0;
 192     for (i = 0; i < 16; i++) {
 193         for (j = 0; j < 16; j += 8) {
 194             s += pix[0];
 195             s += pix[1];
 196             s += pix[2];
 197             s += pix[3];
 198             s += pix[4];
 199             s += pix[5];
 200             s += pix[6];
 201             s += pix[7];
 202             pix += 8;
 203         }
 204         pix += line_size - 16;
 205     }
 206     return s;
 207 }
 208
 209 static int pix_norm1_c(uint8_t * pix, int line_size)
 210 {
 211     int s, i, j;
 212     uint32_t *sq = ff_squareTbl + 256;
 213
 214     s = 0;
 215     for (i = 0; i < 16; i++) {
 216         for (j = 0; j < 16; j += 8) {
 217 #if 0
 218             s += sq[pix[0]];
 219             s += sq[pix[1]];
 220             s += sq[pix[2]];
 221             s += sq[pix[3]];
 222             s += sq[pix[4]];
 223             s += sq[pix[5]];
 224             s += sq[pix[6]];
 225             s += sq[pix[7]];
 226 #else
 227 #if LONG_MAX > 2147483647
 228             register uint64_t x=*(uint64_t*)pix;
 229             s += sq[x&0xff];
 230             s += sq[(x>>8)&0xff];
 231             s += sq[(x>>16)&0xff];
 232             s += sq[(x>>24)&0xff];
 233             s += sq[(x>>32)&0xff];
 234             s += sq[(x>>40)&0xff];
 235             s += sq[(x>>48)&0xff];
 236             s += sq[(x>>56)&0xff];
 237 #else
 238             register uint32_t x=*(uint32_t*)pix;
 239             s += sq[x&0xff];
 240             s += sq[(x>>8)&0xff];
 241             s += sq[(x>>16)&0xff];
 242             s += sq[(x>>24)&0xff];
 243             x=*(uint32_t*)(pix+4);
 244             s += sq[x&0xff];
 245             s += sq[(x>>8)&0xff];
 246             s += sq[(x>>16)&0xff];
 247             s += sq[(x>>24)&0xff];
 248 #endif
 249 #endif
 250             pix += 8;
 251         }
 252         pix += line_size - 16;
 253     }
 254     return s;
 255 }
 256
 257 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 258     int i;
 259
 260     for(i=0; i+8<=w; i+=8){
 261         dst[i+0]= bswap_32(src[i+0]);
 262         dst[i+1]= bswap_32(src[i+1]);
 263         dst[i+2]= bswap_32(src[i+2]);
 264         dst[i+3]= bswap_32(src[i+3]);
 265         dst[i+4]= bswap_32(src[i+4]);
 266         dst[i+5]= bswap_32(src[i+5]);
 267         dst[i+6]= bswap_32(src[i+6]);
 268         dst[i+7]= bswap_32(src[i+7]);
 269     }
 270     for(;i<w; i++){
 271         dst[i+0]= bswap_32(src[i+0]);
 272     }
 273 }
 274
 275 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 276 {
 277     int s, i;
 278     uint32_t *sq = ff_squareTbl + 256;
 279
 280     s = 0;
 281     for (i = 0; i < h; i++) {
 282         s += sq[pix1[0] - pix2[0]];
 283         s += sq[pix1[1] - pix2[1]];
 284         s += sq[pix1[2] - pix2[2]];
 285         s += sq[pix1[3] - pix2[3]];
 286         pix1 += line_size;
 287         pix2 += line_size;
 288     }
 289     return s;
 290 }
 291
 292 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 293 {
 294     int s, i;
 295     uint32_t *sq = ff_squareTbl + 256;
 296
 297     s = 0;
 298     for (i = 0; i < h; i++) {
 299         s += sq[pix1[0] - pix2[0]];
 300         s += sq[pix1[1] - pix2[1]];
 301         s += sq[pix1[2] - pix2[2]];
 302         s += sq[pix1[3] - pix2[3]];
 303         s += sq[pix1[4] - pix2[4]];
 304         s += sq[pix1[5] - pix2[5]];
 305         s += sq[pix1[6] - pix2[6]];
 306         s += sq[pix1[7] - pix2[7]];
 307         pix1 += line_size;
 308         pix2 += line_size;
 309     }
 310     return s;
 311 }
 312
 313 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 314 {
 315     int s, i;
 316     uint32_t *sq = ff_squareTbl + 256;
 317
 318     s = 0;
 319     for (i = 0; i < h; i++) {
 320         s += sq[pix1[ 0] - pix2[ 0]];
 321         s += sq[pix1[ 1] - pix2[ 1]];
 322         s += sq[pix1[ 2] - pix2[ 2]];
 323         s += sq[pix1[ 3] - pix2[ 3]];
 324         s += sq[pix1[ 4] - pix2[ 4]];
 325         s += sq[pix1[ 5] - pix2[ 5]];
 326         s += sq[pix1[ 6] - pix2[ 6]];
 327         s += sq[pix1[ 7] - pix2[ 7]];
 328         s += sq[pix1[ 8] - pix2[ 8]];
 329         s += sq[pix1[ 9] - pix2[ 9]];
 330         s += sq[pix1[10] - pix2[10]];
 331         s += sq[pix1[11] - pix2[11]];
 332         s += sq[pix1[12] - pix2[12]];
 333         s += sq[pix1[13] - pix2[13]];
 334         s += sq[pix1[14] - pix2[14]];
 335         s += sq[pix1[15] - pix2[15]];
 336
 337         pix1 += line_size;
 338         pix2 += line_size;
 339     }
 340     return s;
 341 }
 342
 343
 344 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
 345 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
 346     int s, i, j;
 347     const int dec_count= w==8 ? 3 : 4;
 348     int tmp[32*32];
 349     int level, ori;
 350     static const int scale[2][2][4][4]={
 351       {
 352         {
 353             // 9/7 8x8 dec=3
 354             {268, 239, 239, 213},
 355             {  0, 224, 224, 152},
 356             {  0, 135, 135, 110},
 357         },{
 358             // 9/7 16x16 or 32x32 dec=4
 359             {344, 310, 310, 280},
 360             {  0, 320, 320, 228},
 361             {  0, 175, 175, 136},
 362             {  0, 129, 129, 102},
 363         }
 364       },{
 365         {
 366             // 5/3 8x8 dec=3
 367             {275, 245, 245, 218},
 368             {  0, 230, 230, 156},
 369             {  0, 138, 138, 113},
 370         },{
 371             // 5/3 16x16 or 32x32 dec=4
 372             {352, 317, 317, 286},
 373             {  0, 328, 328, 233},
 374             {  0, 180, 180, 140},
 375             {  0, 132, 132, 105},
 376         }
 377       }
 378     };
 379
 380     for (i = 0; i < h; i++) {
 381         for (j = 0; j < w; j+=4) {
 382             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
 383             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
 384             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
 385             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
 386         }
 387         pix1 += line_size;
 388         pix2 += line_size;
 389     }
 390
 391     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
 392
 393     s=0;
 394     assert(w==h);
 395     for(level=0; level<dec_count; level++){
 396         for(ori= level ? 1 : 0; ori<4; ori++){
 397             int size= w>>(dec_count-level);
 398             int sx= (ori&1) ? size : 0;
 399             int stride= 32<<(dec_count-level);
 400             int sy= (ori&2) ? stride>>1 : 0;
 401
 402             for(i=0; i<size; i++){
 403                 for(j=0; j<size; j++){
 404                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
 405                     s += FFABS(v);
 406                 }
 407             }
 408         }
 409     }
 410     assert(s>=0);
 411     return s>>9;
 412 }
 413
 414 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 415     return w_c(v, pix1, pix2, line_size,  8, h, 1);
 416 }
 417
 418 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 419     return w_c(v, pix1, pix2, line_size,  8, h, 0);
 420 }
 421
 422 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 423     return w_c(v, pix1, pix2, line_size, 16, h, 1);
 424 }
 425
 426 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 427     return w_c(v, pix1, pix2, line_size, 16, h, 0);
 428 }
 429
 430 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 431     return w_c(v, pix1, pix2, line_size, 32, h, 1);
 432 }
 433
 434 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
 435     return w_c(v, pix1, pix2, line_size, 32, h, 0);
 436 }
 437 #endif
 438
 439 /* draw the edges of width 'w' of an image of size width, height */
 440 //FIXME check that this is ok for mpeg4 interlaced
 441 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 442 {
 443     uint8_t *ptr, *last_line;
 444     int i;
 445
 446     last_line = buf + (height - 1) * wrap;
 447     for(i=0;i<w;i++) {
 448         /* top and bottom */
 449         memcpy(buf - (i + 1) * wrap, buf, width);
 450         memcpy(last_line + (i + 1) * wrap, last_line, width);
 451     }
 452     /* left and right */
 453     ptr = buf;
 454     for(i=0;i<height;i++) {
 455         memset(ptr - w, ptr[0], w);
 456         memset(ptr + width, ptr[width-1], w);
 457         ptr += wrap;
 458     }
 459     /* corners */
 460     for(i=0;i<w;i++) {
 461         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 462         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 463         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 464         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 465     }
 466 }
 467
 468 /**
 469  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 470  * @param buf destination buffer
 471  * @param src source buffer
 472  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 473  * @param block_w width of block
 474  * @param block_h height of block
 475  * @param src_x x coordinate of the top left sample of the block in the source buffer
 476  * @param src_y y coordinate of the top left sample of the block in the source buffer
 477  * @param w width of the source buffer
 478  * @param h height of the source buffer
 479  */
 480 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 481                                     int src_x, int src_y, int w, int h){
 482     int x, y;
 483     int start_y, start_x, end_y, end_x;
 484
 485     if(src_y>= h){
 486         src+= (h-1-src_y)*linesize;
 487         src_y=h-1;
 488     }else if(src_y<=-block_h){
 489         src+= (1-block_h-src_y)*linesize;
 490         src_y=1-block_h;
 491     }
 492     if(src_x>= w){
 493         src+= (w-1-src_x);
 494         src_x=w-1;
 495     }else if(src_x<=-block_w){
 496         src+= (1-block_w-src_x);
 497         src_x=1-block_w;
 498     }
 499
 500     start_y= FFMAX(0, -src_y);
 501     start_x= FFMAX(0, -src_x);
 502     end_y= FFMIN(block_h, h-src_y);
 503     end_x= FFMIN(block_w, w-src_x);
 504
 505     // copy existing part
 506     for(y=start_y; y<end_y; y++){
 507         for(x=start_x; x<end_x; x++){
 508             buf[x + y*linesize]= src[x + y*linesize];
 509         }
 510     }
 511
 512     //top
 513     for(y=0; y<start_y; y++){
 514         for(x=start_x; x<end_x; x++){
 515             buf[x + y*linesize]= buf[x + start_y*linesize];
 516         }
 517     }
 518
 519     //bottom
 520     for(y=end_y; y<block_h; y++){
 521         for(x=start_x; x<end_x; x++){
 522             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 523         }
 524     }
 525
 526     for(y=0; y<block_h; y++){
 527        //left
 528         for(x=0; x<start_x; x++){
 529             buf[x + y*linesize]= buf[start_x + y*linesize];
 530         }
 531
 532        //right
 533         for(x=end_x; x<block_w; x++){
 534             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 535         }
 536     }
 537 }
 538
 539 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 540 {
 541     int i;
 542
 543     /* read the pixels */
 544     for(i=0;i<8;i++) {
 545         block[0] = pixels[0];
 546         block[1] = pixels[1];
 547         block[2] = pixels[2];
 548         block[3] = pixels[3];
 549         block[4] = pixels[4];
 550         block[5] = pixels[5];
 551         block[6] = pixels[6];
 552         block[7] = pixels[7];
 553         pixels += line_size;
 554         block += 8;
 555     }
 556 }
 557
 558 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 559                           const uint8_t *s2, int stride){
 560     int i;
 561
 562     /* read the pixels */
 563     for(i=0;i<8;i++) {
 564         block[0] = s1[0] - s2[0];
 565         block[1] = s1[1] - s2[1];
 566         block[2] = s1[2] - s2[2];
 567         block[3] = s1[3] - s2[3];
 568         block[4] = s1[4] - s2[4];
 569         block[5] = s1[5] - s2[5];
 570         block[6] = s1[6] - s2[6];
 571         block[7] = s1[7] - s2[7];
 572         s1 += stride;
 573         s2 += stride;
 574         block += 8;
 575     }
 576 }
 577
 578
 579 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 580                                  int line_size)
 581 {
 582     int i;
 583     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 584
 585     /* read the pixels */
 586     for(i=0;i<8;i++) {
 587         pixels[0] = cm[block[0]];
 588         pixels[1] = cm[block[1]];
 589         pixels[2] = cm[block[2]];
 590         pixels[3] = cm[block[3]];
 591         pixels[4] = cm[block[4]];
 592         pixels[5] = cm[block[5]];
 593         pixels[6] = cm[block[6]];
 594         pixels[7] = cm[block[7]];
 595
 596         pixels += line_size;
 597         block += 8;
 598     }
 599 }
 600
 601 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 602                                  int line_size)
 603 {
 604     int i;
 605     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 606
 607     /* read the pixels */
 608     for(i=0;i<4;i++) {
 609         pixels[0] = cm[block[0]];
 610         pixels[1] = cm[block[1]];
 611         pixels[2] = cm[block[2]];
 612         pixels[3] = cm[block[3]];
 613
 614         pixels += line_size;
 615         block += 8;
 616     }
 617 }
 618
 619 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 620                                  int line_size)
 621 {
 622     int i;
 623     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 624
 625     /* read the pixels */
 626     for(i=0;i<2;i++) {
 627         pixels[0] = cm[block[0]];
 628         pixels[1] = cm[block[1]];
 629
 630         pixels += line_size;
 631         block += 8;
 632     }
 633 }
 634
 635 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 636                                         uint8_t *restrict pixels,
 637                                         int line_size)
 638 {
 639     int i, j;
 640
 641     for (i = 0; i < 8; i++) {
 642         for (j = 0; j < 8; j++) {
 643             if (*block < -128)
 644                 *pixels = 0;
 645             else if (*block > 127)
 646                 *pixels = 255;
 647             else
 648                 *pixels = (uint8_t)(*block + 128);
 649             block++;
 650             pixels++;
 651         }
 652         pixels += (line_size - 8);
 653     }
 654 }
 655
 656 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 657                           int line_size)
 658 {
 659     int i;
 660     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 661
 662     /* read the pixels */
 663     for(i=0;i<8;i++) {
 664         pixels[0] = cm[pixels[0] + block[0]];
 665         pixels[1] = cm[pixels[1] + block[1]];
 666         pixels[2] = cm[pixels[2] + block[2]];
 667         pixels[3] = cm[pixels[3] + block[3]];
 668         pixels[4] = cm[pixels[4] + block[4]];
 669         pixels[5] = cm[pixels[5] + block[5]];
 670         pixels[6] = cm[pixels[6] + block[6]];
 671         pixels[7] = cm[pixels[7] + block[7]];
 672         pixels += line_size;
 673         block += 8;
 674     }
 675 }
 676
 677 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 678                           int line_size)
 679 {
 680     int i;
 681     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 682
 683     /* read the pixels */
 684     for(i=0;i<4;i++) {
 685         pixels[0] = cm[pixels[0] + block[0]];
 686         pixels[1] = cm[pixels[1] + block[1]];
 687         pixels[2] = cm[pixels[2] + block[2]];
 688         pixels[3] = cm[pixels[3] + block[3]];
 689         pixels += line_size;
 690         block += 8;
 691     }
 692 }
 693
 694 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 695                           int line_size)
 696 {
 697     int i;
 698     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 699
 700     /* read the pixels */
 701     for(i=0;i<2;i++) {
 702         pixels[0] = cm[pixels[0] + block[0]];
 703         pixels[1] = cm[pixels[1] + block[1]];
 704         pixels += line_size;
 705         block += 8;
 706     }
 707 }
 708
 709 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 710 {
 711     int i;
 712     for(i=0;i<8;i++) {
 713         pixels[0] += block[0];
 714         pixels[1] += block[1];
 715         pixels[2] += block[2];
 716         pixels[3] += block[3];
 717         pixels[4] += block[4];
 718         pixels[5] += block[5];
 719         pixels[6] += block[6];
 720         pixels[7] += block[7];
 721         pixels += line_size;
 722         block += 8;
 723     }
 724 }
 725
 726 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 727 {
 728     int i;
 729     for(i=0;i<4;i++) {
 730         pixels[0] += block[0];
 731         pixels[1] += block[1];
 732         pixels[2] += block[2];
 733         pixels[3] += block[3];
 734         pixels += line_size;
 735         block += 4;
 736     }
 737 }
 738
 739 static int sum_abs_dctelem_c(DCTELEM *block)
 740 {
 741     int sum=0, i;
 742     for(i=0; i<64; i++)
 743         sum+= FFABS(block[i]);
 744     return sum;
 745 }
 746
 747 #if 0
 748
 749 #define PIXOP2(OPNAME, OP) \
 750 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 751 {\
 752     int i;\
 753     for(i=0; i<h; i++){\
 754         OP(*((uint64_t*)block), AV_RN64(pixels));\
 755         pixels+=line_size;\
 756         block +=line_size;\
 757     }\
 758 }\
 759 \
 760 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 761 {\
 762     int i;\
 763     for(i=0; i<h; i++){\
 764         const uint64_t a= AV_RN64(pixels  );\
 765         const uint64_t b= AV_RN64(pixels+1);\
 766         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 767         pixels+=line_size;\
 768         block +=line_size;\
 769     }\
 770 }\
 771 \
 772 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 773 {\
 774     int i;\
 775     for(i=0; i<h; i++){\
 776         const uint64_t a= AV_RN64(pixels  );\
 777         const uint64_t b= AV_RN64(pixels+1);\
 778         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 779         pixels+=line_size;\
 780         block +=line_size;\
 781     }\
 782 }\
 783 \
 784 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 785 {\
 786     int i;\
 787     for(i=0; i<h; i++){\
 788         const uint64_t a= AV_RN64(pixels          );\
 789         const uint64_t b= AV_RN64(pixels+line_size);\
 790         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 791         pixels+=line_size;\
 792         block +=line_size;\
 793     }\
 794 }\
 795 \
 796 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 797 {\
 798     int i;\
 799     for(i=0; i<h; i++){\
 800         const uint64_t a= AV_RN64(pixels          );\
 801         const uint64_t b= AV_RN64(pixels+line_size);\
 802         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 803         pixels+=line_size;\
 804         block +=line_size;\
 805     }\
 806 }\
 807 \
 808 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 809 {\
 810         int i;\
 811         const uint64_t a= AV_RN64(pixels  );\
 812         const uint64_t b= AV_RN64(pixels+1);\
 813         uint64_t l0=  (a&0x0303030303030303ULL)\
 814                     + (b&0x0303030303030303ULL)\
 815                     + 0x0202020202020202ULL;\
 816         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 817                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 818         uint64_t l1,h1;\
 819 \
 820         pixels+=line_size;\
 821         for(i=0; i<h; i+=2){\
 822             uint64_t a= AV_RN64(pixels  );\
 823             uint64_t b= AV_RN64(pixels+1);\
 824             l1=  (a&0x0303030303030303ULL)\
 825                + (b&0x0303030303030303ULL);\
 826             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 827               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 828             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 829             pixels+=line_size;\
 830             block +=line_size;\
 831             a= AV_RN64(pixels  );\
 832             b= AV_RN64(pixels+1);\
 833             l0=  (a&0x0303030303030303ULL)\
 834                + (b&0x0303030303030303ULL)\
 835                + 0x0202020202020202ULL;\
 836             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 837               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 838             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 839             pixels+=line_size;\
 840             block +=line_size;\
 841         }\
 842 }\
 843 \
 844 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 845 {\
 846         int i;\
 847         const uint64_t a= AV_RN64(pixels  );\
 848         const uint64_t b= AV_RN64(pixels+1);\
 849         uint64_t l0=  (a&0x0303030303030303ULL)\
 850                     + (b&0x0303030303030303ULL)\
 851                     + 0x0101010101010101ULL;\
 852         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 853                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 854         uint64_t l1,h1;\
 855 \
 856         pixels+=line_size;\
 857         for(i=0; i<h; i+=2){\
 858             uint64_t a= AV_RN64(pixels  );\
 859             uint64_t b= AV_RN64(pixels+1);\
 860             l1=  (a&0x0303030303030303ULL)\
 861                + (b&0x0303030303030303ULL);\
 862             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 863               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 864             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 865             pixels+=line_size;\
 866             block +=line_size;\
 867             a= AV_RN64(pixels  );\
 868             b= AV_RN64(pixels+1);\
 869             l0=  (a&0x0303030303030303ULL)\
 870                + (b&0x0303030303030303ULL)\
 871                + 0x0101010101010101ULL;\
 872             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 873               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 874             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 875             pixels+=line_size;\
 876             block +=line_size;\
 877         }\
 878 }\
 879 \
 880 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 881 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 882 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 883 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 884 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 885 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 886 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 887
 888 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 889 #else // 64 bit variant
 890
 891 #define PIXOP2(OPNAME, OP) \
 892 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 893     int i;\
 894     for(i=0; i<h; i++){\
 895         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 896         pixels+=line_size;\
 897         block +=line_size;\
 898     }\
 899 }\
 900 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 901     int i;\
 902     for(i=0; i<h; i++){\
 903         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 904         pixels+=line_size;\
 905         block +=line_size;\
 906     }\
 907 }\
 908 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 909     int i;\
 910     for(i=0; i<h; i++){\
 911         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 912         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 913         pixels+=line_size;\
 914         block +=line_size;\
 915     }\
 916 }\
 917 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 918     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 919 }\
 920 \
 921 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 922                                                 int src_stride1, int src_stride2, int h){\
 923     int i;\
 924     for(i=0; i<h; i++){\
 925         uint32_t a,b;\
 926         a= AV_RN32(&src1[i*src_stride1  ]);\
 927         b= AV_RN32(&src2[i*src_stride2  ]);\
 928         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 929         a= AV_RN32(&src1[i*src_stride1+4]);\
 930         b= AV_RN32(&src2[i*src_stride2+4]);\
 931         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 932     }\
 933 }\
 934 \
 935 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 936                                                 int src_stride1, int src_stride2, int h){\
 937     int i;\
 938     for(i=0; i<h; i++){\
 939         uint32_t a,b;\
 940         a= AV_RN32(&src1[i*src_stride1  ]);\
 941         b= AV_RN32(&src2[i*src_stride2  ]);\
 942         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 943         a= AV_RN32(&src1[i*src_stride1+4]);\
 944         b= AV_RN32(&src2[i*src_stride2+4]);\
 945         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 946     }\
 947 }\
 948 \
 949 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 950                                                 int src_stride1, int src_stride2, int h){\
 951     int i;\
 952     for(i=0; i<h; i++){\
 953         uint32_t a,b;\
 954         a= AV_RN32(&src1[i*src_stride1  ]);\
 955         b= AV_RN32(&src2[i*src_stride2  ]);\
 956         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 957     }\
 958 }\
 959 \
 960 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 961                                                 int src_stride1, int src_stride2, int h){\
 962     int i;\
 963     for(i=0; i<h; i++){\
 964         uint32_t a,b;\
 965         a= AV_RN16(&src1[i*src_stride1  ]);\
 966         b= AV_RN16(&src2[i*src_stride2  ]);\
 967         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 968     }\
 969 }\
 970 \
 971 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 972                                                 int src_stride1, int src_stride2, int h){\
 973     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 974     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 975 }\
 976 \
 977 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 978                                                 int src_stride1, int src_stride2, int h){\
 979     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 980     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 981 }\
 982 \
 983 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 984     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 985 }\
 986 \
 987 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 988     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 989 }\
 990 \
 991 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 992     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 993 }\
 994 \
 995 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 996     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 997 }\
 998 \
 999 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1000                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1001     int i;\
1002     for(i=0; i<h; i++){\
1003         uint32_t a, b, c, d, l0, l1, h0, h1;\
1004         a= AV_RN32(&src1[i*src_stride1]);\
1005         b= AV_RN32(&src2[i*src_stride2]);\
1006         c= AV_RN32(&src3[i*src_stride3]);\
1007         d= AV_RN32(&src4[i*src_stride4]);\
1008         l0=  (a&0x03030303UL)\
1009            + (b&0x03030303UL)\
1010            + 0x02020202UL;\
1011         h0= ((a&0xFCFCFCFCUL)>>2)\
1012           + ((b&0xFCFCFCFCUL)>>2);\
1013         l1=  (c&0x03030303UL)\
1014            + (d&0x03030303UL);\
1015         h1= ((c&0xFCFCFCFCUL)>>2)\
1016           + ((d&0xFCFCFCFCUL)>>2);\
1017         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1018         a= AV_RN32(&src1[i*src_stride1+4]);\
1019         b= AV_RN32(&src2[i*src_stride2+4]);\
1020         c= AV_RN32(&src3[i*src_stride3+4]);\
1021         d= AV_RN32(&src4[i*src_stride4+4]);\
1022         l0=  (a&0x03030303UL)\
1023            + (b&0x03030303UL)\
1024            + 0x02020202UL;\
1025         h0= ((a&0xFCFCFCFCUL)>>2)\
1026           + ((b&0xFCFCFCFCUL)>>2);\
1027         l1=  (c&0x03030303UL)\
1028            + (d&0x03030303UL);\
1029         h1= ((c&0xFCFCFCFCUL)>>2)\
1030           + ((d&0xFCFCFCFCUL)>>2);\
1031         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1032     }\
1033 }\
1034 \
1035 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1036     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1037 }\
1038 \
1039 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1040     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1041 }\
1042 \
1043 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1044     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1045 }\
1046 \
1047 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1048     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1049 }\
1050 \
1051 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1052                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1053     int i;\
1054     for(i=0; i<h; i++){\
1055         uint32_t a, b, c, d, l0, l1, h0, h1;\
1056         a= AV_RN32(&src1[i*src_stride1]);\
1057         b= AV_RN32(&src2[i*src_stride2]);\
1058         c= AV_RN32(&src3[i*src_stride3]);\
1059         d= AV_RN32(&src4[i*src_stride4]);\
1060         l0=  (a&0x03030303UL)\
1061            + (b&0x03030303UL)\
1062            + 0x01010101UL;\
1063         h0= ((a&0xFCFCFCFCUL)>>2)\
1064           + ((b&0xFCFCFCFCUL)>>2);\
1065         l1=  (c&0x03030303UL)\
1066            + (d&0x03030303UL);\
1067         h1= ((c&0xFCFCFCFCUL)>>2)\
1068           + ((d&0xFCFCFCFCUL)>>2);\
1069         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1070         a= AV_RN32(&src1[i*src_stride1+4]);\
1071         b= AV_RN32(&src2[i*src_stride2+4]);\
1072         c= AV_RN32(&src3[i*src_stride3+4]);\
1073         d= AV_RN32(&src4[i*src_stride4+4]);\
1074         l0=  (a&0x03030303UL)\
1075            + (b&0x03030303UL)\
1076            + 0x01010101UL;\
1077         h0= ((a&0xFCFCFCFCUL)>>2)\
1078           + ((b&0xFCFCFCFCUL)>>2);\
1079         l1=  (c&0x03030303UL)\
1080            + (d&0x03030303UL);\
1081         h1= ((c&0xFCFCFCFCUL)>>2)\
1082           + ((d&0xFCFCFCFCUL)>>2);\
1083         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1084     }\
1085 }\
1086 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1087                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1088     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1089     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1090 }\
1091 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1092                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1093     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1094     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1095 }\
1096 \
1097 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1098 {\
1099         int i, a0, b0, a1, b1;\
1100         a0= pixels[0];\
1101         b0= pixels[1] + 2;\
1102         a0 += b0;\
1103         b0 += pixels[2];\
1104 \
1105         pixels+=line_size;\
1106         for(i=0; i<h; i+=2){\
1107             a1= pixels[0];\
1108             b1= pixels[1];\
1109             a1 += b1;\
1110             b1 += pixels[2];\
1111 \
1112             block[0]= (a1+a0)>>2; /* FIXME non put */\
1113             block[1]= (b1+b0)>>2;\
1114 \
1115             pixels+=line_size;\
1116             block +=line_size;\
1117 \
1118             a0= pixels[0];\
1119             b0= pixels[1] + 2;\
1120             a0 += b0;\
1121             b0 += pixels[2];\
1122 \
1123             block[0]= (a1+a0)>>2;\
1124             block[1]= (b1+b0)>>2;\
1125             pixels+=line_size;\
1126             block +=line_size;\
1127         }\
1128 }\
1129 \
1130 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1131 {\
1132         int i;\
1133         const uint32_t a= AV_RN32(pixels  );\
1134         const uint32_t b= AV_RN32(pixels+1);\
1135         uint32_t l0=  (a&0x03030303UL)\
1136                     + (b&0x03030303UL)\
1137                     + 0x02020202UL;\
1138         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1139                    + ((b&0xFCFCFCFCUL)>>2);\
1140         uint32_t l1,h1;\
1141 \
1142         pixels+=line_size;\
1143         for(i=0; i<h; i+=2){\
1144             uint32_t a= AV_RN32(pixels  );\
1145             uint32_t b= AV_RN32(pixels+1);\
1146             l1=  (a&0x03030303UL)\
1147                + (b&0x03030303UL);\
1148             h1= ((a&0xFCFCFCFCUL)>>2)\
1149               + ((b&0xFCFCFCFCUL)>>2);\
1150             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1151             pixels+=line_size;\
1152             block +=line_size;\
1153             a= AV_RN32(pixels  );\
1154             b= AV_RN32(pixels+1);\
1155             l0=  (a&0x03030303UL)\
1156                + (b&0x03030303UL)\
1157                + 0x02020202UL;\
1158             h0= ((a&0xFCFCFCFCUL)>>2)\
1159               + ((b&0xFCFCFCFCUL)>>2);\
1160             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1161             pixels+=line_size;\
1162             block +=line_size;\
1163         }\
1164 }\
1165 \
1166 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1167 {\
1168     int j;\
1169     for(j=0; j<2; j++){\
1170         int i;\
1171         const uint32_t a= AV_RN32(pixels  );\
1172         const uint32_t b= AV_RN32(pixels+1);\
1173         uint32_t l0=  (a&0x03030303UL)\
1174                     + (b&0x03030303UL)\
1175                     + 0x02020202UL;\
1176         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1177                    + ((b&0xFCFCFCFCUL)>>2);\
1178         uint32_t l1,h1;\
1179 \
1180         pixels+=line_size;\
1181         for(i=0; i<h; i+=2){\
1182             uint32_t a= AV_RN32(pixels  );\
1183             uint32_t b= AV_RN32(pixels+1);\
1184             l1=  (a&0x03030303UL)\
1185                + (b&0x03030303UL);\
1186             h1= ((a&0xFCFCFCFCUL)>>2)\
1187               + ((b&0xFCFCFCFCUL)>>2);\
1188             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1189             pixels+=line_size;\
1190             block +=line_size;\
1191             a= AV_RN32(pixels  );\
1192             b= AV_RN32(pixels+1);\
1193             l0=  (a&0x03030303UL)\
1194                + (b&0x03030303UL)\
1195                + 0x02020202UL;\
1196             h0= ((a&0xFCFCFCFCUL)>>2)\
1197               + ((b&0xFCFCFCFCUL)>>2);\
1198             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1199             pixels+=line_size;\
1200             block +=line_size;\
1201         }\
1202         pixels+=4-line_size*(h+1);\
1203         block +=4-line_size*h;\
1204     }\
1205 }\
1206 \
1207 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1208 {\
1209     int j;\
1210     for(j=0; j<2; j++){\
1211         int i;\
1212         const uint32_t a= AV_RN32(pixels  );\
1213         const uint32_t b= AV_RN32(pixels+1);\
1214         uint32_t l0=  (a&0x03030303UL)\
1215                     + (b&0x03030303UL)\
1216                     + 0x01010101UL;\
1217         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1218                    + ((b&0xFCFCFCFCUL)>>2);\
1219         uint32_t l1,h1;\
1220 \
1221         pixels+=line_size;\
1222         for(i=0; i<h; i+=2){\
1223             uint32_t a= AV_RN32(pixels  );\
1224             uint32_t b= AV_RN32(pixels+1);\
1225             l1=  (a&0x03030303UL)\
1226                + (b&0x03030303UL);\
1227             h1= ((a&0xFCFCFCFCUL)>>2)\
1228               + ((b&0xFCFCFCFCUL)>>2);\
1229             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1230             pixels+=line_size;\
1231             block +=line_size;\
1232             a= AV_RN32(pixels  );\
1233             b= AV_RN32(pixels+1);\
1234             l0=  (a&0x03030303UL)\
1235                + (b&0x03030303UL)\
1236                + 0x01010101UL;\
1237             h0= ((a&0xFCFCFCFCUL)>>2)\
1238               + ((b&0xFCFCFCFCUL)>>2);\
1239             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1240             pixels+=line_size;\
1241             block +=line_size;\
1242         }\
1243         pixels+=4-line_size*(h+1);\
1244         block +=4-line_size*h;\
1245     }\
1246 }\
1247 \
1248 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1249 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1250 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1251 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1252 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1253 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1254 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1256
1257 #define op_avg(a, b) a = rnd_avg32(a, b)
1258 #endif
1259 #define op_put(a, b) a = b
1260
1261 PIXOP2(avg, op_avg)
1262 PIXOP2(put, op_put)
1263 #undef op_avg
1264 #undef op_put
1265
1266 #define avg2(a,b) ((a+b+1)>>1)
1267 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1268
1269 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1270     put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1271 }
1272
1273 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1274     put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1275 }
1276
1277 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1278 {
1279     const int A=(16-x16)*(16-y16);
1280     const int B=(   x16)*(16-y16);
1281     const int C=(16-x16)*(   y16);
1282     const int D=(   x16)*(   y16);
1283     int i;
1284
1285     for(i=0; i<h; i++)
1286     {
1287         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1288         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1289         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1290         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1291         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1292         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1293         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1294         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1295         dst+= stride;
1296         src+= stride;
1297     }
1298 }
1299
1300 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1301                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1302 {
1303     int y, vx, vy;
1304     const int s= 1<<shift;
1305
1306     width--;
1307     height--;
1308
1309     for(y=0; y<h; y++){
1310         int x;
1311
1312         vx= ox;
1313         vy= oy;
1314         for(x=0; x<8; x++){ //XXX FIXME optimize
1315             int src_x, src_y, frac_x, frac_y, index;
1316
1317             src_x= vx>>16;
1318             src_y= vy>>16;
1319             frac_x= src_x&(s-1);
1320             frac_y= src_y&(s-1);
1321             src_x>>=shift;
1322             src_y>>=shift;
1323
1324             if((unsigned)src_x < width){
1325                 if((unsigned)src_y < height){
1326                     index= src_x + src_y*stride;
1327                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1328                                            + src[index       +1]*   frac_x )*(s-frac_y)
1329                                         + (  src[index+stride  ]*(s-frac_x)
1330                                            + src[index+stride+1]*   frac_x )*   frac_y
1331                                         + r)>>(shift*2);
1332                 }else{
1333                     index= src_x + av_clip(src_y, 0, height)*stride;
1334                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1335                                           + src[index       +1]*   frac_x )*s
1336                                         + r)>>(shift*2);
1337                 }
1338             }else{
1339                 if((unsigned)src_y < height){
1340                     index= av_clip(src_x, 0, width) + src_y*stride;
1341                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1342                                            + src[index+stride  ]*   frac_y )*s
1343                                         + r)>>(shift*2);
1344                 }else{
1345                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1346                     dst[y*stride + x]=    src[index         ];
1347                 }
1348             }
1349
1350             vx+= dxx;
1351             vy+= dyx;
1352         }
1353         ox += dxy;
1354         oy += dyy;
1355     }
1356 }
1357
1358 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1359     switch(width){
1360     case 2: put_pixels2_c (dst, src, stride, height); break;
1361     case 4: put_pixels4_c (dst, src, stride, height); break;
1362     case 8: put_pixels8_c (dst, src, stride, height); break;
1363     case 16:put_pixels16_c(dst, src, stride, height); break;
1364     }
1365 }
1366
1367 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1368     int i,j;
1369     for (i=0; i < height; i++) {
1370       for (j=0; j < width; j++) {
1371         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1372       }
1373       src += stride;
1374       dst += stride;
1375     }
1376 }
1377
1378 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1379     int i,j;
1380     for (i=0; i < height; i++) {
1381       for (j=0; j < width; j++) {
1382         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1383       }
1384       src += stride;
1385       dst += stride;
1386     }
1387 }
1388
1389 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1390     int i,j;
1391     for (i=0; i < height; i++) {
1392       for (j=0; j < width; j++) {
1393         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1394       }
1395       src += stride;
1396       dst += stride;
1397     }
1398 }
1399
1400 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1401     int i,j;
1402     for (i=0; i < height; i++) {
1403       for (j=0; j < width; j++) {
1404         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1405       }
1406       src += stride;
1407       dst += stride;
1408     }
1409 }
1410
1411 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1412     int i,j;
1413     for (i=0; i < height; i++) {
1414       for (j=0; j < width; j++) {
1415         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1416       }
1417       src += stride;
1418       dst += stride;
1419     }
1420 }
1421
1422 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1423     int i,j;
1424     for (i=0; i < height; i++) {
1425       for (j=0; j < width; j++) {
1426         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1427       }
1428       src += stride;
1429       dst += stride;
1430     }
1431 }
1432
1433 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1434     int i,j;
1435     for (i=0; i < height; i++) {
1436       for (j=0; j < width; j++) {
1437         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1438       }
1439       src += stride;
1440       dst += stride;
1441     }
1442 }
1443
1444 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1445     int i,j;
1446     for (i=0; i < height; i++) {
1447       for (j=0; j < width; j++) {
1448         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1449       }
1450       src += stride;
1451       dst += stride;
1452     }
1453 }
1454
1455 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1456     switch(width){
1457     case 2: avg_pixels2_c (dst, src, stride, height); break;
1458     case 4: avg_pixels4_c (dst, src, stride, height); break;
1459     case 8: avg_pixels8_c (dst, src, stride, height); break;
1460     case 16:avg_pixels16_c(dst, src, stride, height); break;
1461     }
1462 }
1463
1464 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1465     int i,j;
1466     for (i=0; i < height; i++) {
1467       for (j=0; j < width; j++) {
1468         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1469       }
1470       src += stride;
1471       dst += stride;
1472     }
1473 }
1474
1475 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1476     int i,j;
1477     for (i=0; i < height; i++) {
1478       for (j=0; j < width; j++) {
1479         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1480       }
1481       src += stride;
1482       dst += stride;
1483     }
1484 }
1485
1486 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1487     int i,j;
1488     for (i=0; i < height; i++) {
1489       for (j=0; j < width; j++) {
1490         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1491       }
1492       src += stride;
1493       dst += stride;
1494     }
1495 }
1496
1497 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1498     int i,j;
1499     for (i=0; i < height; i++) {
1500       for (j=0; j < width; j++) {
1501         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1502       }
1503       src += stride;
1504       dst += stride;
1505     }
1506 }
1507
1508 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1509     int i,j;
1510     for (i=0; i < height; i++) {
1511       for (j=0; j < width; j++) {
1512         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1513       }
1514       src += stride;
1515       dst += stride;
1516     }
1517 }
1518
1519 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1520     int i,j;
1521     for (i=0; i < height; i++) {
1522       for (j=0; j < width; j++) {
1523         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1524       }
1525       src += stride;
1526       dst += stride;
1527     }
1528 }
1529
1530 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1531     int i,j;
1532     for (i=0; i < height; i++) {
1533       for (j=0; j < width; j++) {
1534         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1535       }
1536       src += stride;
1537       dst += stride;
1538     }
1539 }
1540
1541 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1542     int i,j;
1543     for (i=0; i < height; i++) {
1544       for (j=0; j < width; j++) {
1545         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1546       }
1547       src += stride;
1548       dst += stride;
1549     }
1550 }
1551 #if 0
1552 #define TPEL_WIDTH(width)\
1553 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1554     void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1555 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1556     void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1557 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1558     void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1559 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1560     void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1561 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1562     void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1563 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1564     void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1565 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1566     void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1567 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1568     void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1569 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1570     void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1571 #endif
1572
1573 #define H264_CHROMA_MC(OPNAME, OP)\
1574 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1575     const int A=(8-x)*(8-y);\
1576     const int B=(  x)*(8-y);\
1577     const int C=(8-x)*(  y);\
1578     const int D=(  x)*(  y);\
1579     int i;\
1580     \
1581     assert(x<8 && y<8 && x>=0 && y>=0);\
1582 \
1583     if(D){\
1584         for(i=0; i<h; i++){\
1585             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1586             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1587             dst+= stride;\
1588             src+= stride;\
1589         }\
1590     }else{\
1591         const int E= B+C;\
1592         const int step= C ? stride : 1;\
1593         for(i=0; i<h; i++){\
1594             OP(dst[0], (A*src[0] + E*src[step+0]));\
1595             OP(dst[1], (A*src[1] + E*src[step+1]));\
1596             dst+= stride;\
1597             src+= stride;\
1598         }\
1599     }\
1600 }\
1601 \
1602 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1603     const int A=(8-x)*(8-y);\
1604     const int B=(  x)*(8-y);\
1605     const int C=(8-x)*(  y);\
1606     const int D=(  x)*(  y);\
1607     int i;\
1608     \
1609     assert(x<8 && y<8 && x>=0 && y>=0);\
1610 \
1611     if(D){\
1612         for(i=0; i<h; i++){\
1613             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1614             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1615             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1616             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1617             dst+= stride;\
1618             src+= stride;\
1619         }\
1620     }else{\
1621         const int E= B+C;\
1622         const int step= C ? stride : 1;\
1623         for(i=0; i<h; i++){\
1624             OP(dst[0], (A*src[0] + E*src[step+0]));\
1625             OP(dst[1], (A*src[1] + E*src[step+1]));\
1626             OP(dst[2], (A*src[2] + E*src[step+2]));\
1627             OP(dst[3], (A*src[3] + E*src[step+3]));\
1628             dst+= stride;\
1629             src+= stride;\
1630         }\
1631     }\
1632 }\
1633 \
1634 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1635     const int A=(8-x)*(8-y);\
1636     const int B=(  x)*(8-y);\
1637     const int C=(8-x)*(  y);\
1638     const int D=(  x)*(  y);\
1639     int i;\
1640     \
1641     assert(x<8 && y<8 && x>=0 && y>=0);\
1642 \
1643     if(D){\
1644         for(i=0; i<h; i++){\
1645             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1646             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1647             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1648             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1649             OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1650             OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1651             OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1652             OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1653             dst+= stride;\
1654             src+= stride;\
1655         }\
1656     }else{\
1657         const int E= B+C;\
1658         const int step= C ? stride : 1;\
1659         for(i=0; i<h; i++){\
1660             OP(dst[0], (A*src[0] + E*src[step+0]));\
1661             OP(dst[1], (A*src[1] + E*src[step+1]));\
1662             OP(dst[2], (A*src[2] + E*src[step+2]));\
1663             OP(dst[3], (A*src[3] + E*src[step+3]));\
1664             OP(dst[4], (A*src[4] + E*src[step+4]));\
1665             OP(dst[5], (A*src[5] + E*src[step+5]));\
1666             OP(dst[6], (A*src[6] + E*src[step+6]));\
1667             OP(dst[7], (A*src[7] + E*src[step+7]));\
1668             dst+= stride;\
1669             src+= stride;\
1670         }\
1671     }\
1672 }
1673
1674 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1675 #define op_put(a, b) a = (((b) + 32)>>6)
1676
1677 H264_CHROMA_MC(put_       , op_put)
1678 H264_CHROMA_MC(avg_       , op_avg)
1679 #undef op_avg
1680 #undef op_put
1681
1682 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1683     const int A=(8-x)*(8-y);
1684     const int B=(  x)*(8-y);
1685     const int C=(8-x)*(  y);
1686     const int D=(  x)*(  y);
1687     int i;
1688
1689     assert(x<8 && y<8 && x>=0 && y>=0);
1690
1691     for(i=0; i<h; i++)
1692     {
1693         dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1694         dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1695         dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1696         dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1697         dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1698         dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1699         dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1700         dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1701         dst+= stride;
1702         src+= stride;
1703     }
1704 }
1705
1706 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1707     const int A=(8-x)*(8-y);
1708     const int B=(  x)*(8-y);
1709     const int C=(8-x)*(  y);
1710     const int D=(  x)*(  y);
1711     int i;
1712
1713     assert(x<8 && y<8 && x>=0 && y>=0);
1714
1715     for(i=0; i<h; i++)
1716     {
1717         dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1718         dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1719         dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1720         dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1721         dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1722         dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1723         dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1724         dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1725         dst+= stride;
1726         src+= stride;
1727     }
1728 }
1729
1730 #define QPEL_MC(r, OPNAME, RND, OP) \
1731 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1732     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1733     int i;\
1734     for(i=0; i<h; i++)\
1735     {\
1736         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1737         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1738         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1739         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1740         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1741         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1742         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1743         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1744         dst+=dstStride;\
1745         src+=srcStride;\
1746     }\
1747 }\
1748 \
1749 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1750     const int w=8;\
1751     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1752     int i;\
1753     for(i=0; i<w; i++)\
1754     {\
1755         const int src0= src[0*srcStride];\
1756         const int src1= src[1*srcStride];\
1757         const int src2= src[2*srcStride];\
1758         const int src3= src[3*srcStride];\
1759         const int src4= src[4*srcStride];\
1760         const int src5= src[5*srcStride];\
1761         const int src6= src[6*srcStride];\
1762         const int src7= src[7*srcStride];\
1763         const int src8= src[8*srcStride];\
1764         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1765         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1766         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1767         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1768         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1769         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1770         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1771         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1772         dst++;\
1773         src++;\
1774     }\
1775 }\
1776 \
1777 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1778     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1779     int i;\
1780     \
1781     for(i=0; i<h; i++)\
1782     {\
1783         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1784         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1785         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1786         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1787         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1788         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1789         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1790         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1791         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1792         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1793         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1794         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1795         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1796         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1797         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1798         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1799         dst+=dstStride;\
1800         src+=srcStride;\
1801     }\
1802 }\
1803 \
1804 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1805     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1806     int i;\
1807     const int w=16;\
1808     for(i=0; i<w; i++)\
1809     {\
1810         const int src0= src[0*srcStride];\
1811         const int src1= src[1*srcStride];\
1812         const int src2= src[2*srcStride];\
1813         const int src3= src[3*srcStride];\
1814         const int src4= src[4*srcStride];\
1815         const int src5= src[5*srcStride];\
1816         const int src6= src[6*srcStride];\
1817         const int src7= src[7*srcStride];\
1818         const int src8= src[8*srcStride];\
1819         const int src9= src[9*srcStride];\
1820         const int src10= src[10*srcStride];\
1821         const int src11= src[11*srcStride];\
1822         const int src12= src[12*srcStride];\
1823         const int src13= src[13*srcStride];\
1824         const int src14= src[14*srcStride];\
1825         const int src15= src[15*srcStride];\
1826         const int src16= src[16*srcStride];\
1827         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1828         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1829         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1830         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1831         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1832         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1833         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1834         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1835         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1836         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1837         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1838         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1839         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1840         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1841         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1842         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1843         dst++;\
1844         src++;\
1845     }\
1846 }\
1847 \
1848 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1849     OPNAME ## pixels8_c(dst, src, stride, 8);\
1850 }\
1851 \
1852 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1853     uint8_t half[64];\
1854     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1855     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1856 }\
1857 \
1858 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1859     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1860 }\
1861 \
1862 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1863     uint8_t half[64];\
1864     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1865     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1866 }\
1867 \
1868 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1869     uint8_t full[16*9];\
1870     uint8_t half[64];\
1871     copy_block9(full, src, 16, stride, 9);\
1872     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1873     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1874 }\
1875 \
1876 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1877     uint8_t full[16*9];\
1878     copy_block9(full, src, 16, stride, 9);\
1879     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1880 }\
1881 \
1882 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1883     uint8_t full[16*9];\
1884     uint8_t half[64];\
1885     copy_block9(full, src, 16, stride, 9);\
1886     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1887     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1888 }\
1889 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1890     uint8_t full[16*9];\
1891     uint8_t halfH[72];\
1892     uint8_t halfV[64];\
1893     uint8_t halfHV[64];\
1894     copy_block9(full, src, 16, stride, 9);\
1895     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1896     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1897     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1898     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1899 }\
1900 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1901     uint8_t full[16*9];\
1902     uint8_t halfH[72];\
1903     uint8_t halfHV[64];\
1904     copy_block9(full, src, 16, stride, 9);\
1905     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1906     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1907     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1908     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1909 }\
1910 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1911     uint8_t full[16*9];\
1912     uint8_t halfH[72];\
1913     uint8_t halfV[64];\
1914     uint8_t halfHV[64];\
1915     copy_block9(full, src, 16, stride, 9);\
1916     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1917     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1918     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1919     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1920 }\
1921 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1922     uint8_t full[16*9];\
1923     uint8_t halfH[72];\
1924     uint8_t halfHV[64];\
1925     copy_block9(full, src, 16, stride, 9);\
1926     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1927     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1928     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1929     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1930 }\
1931 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1932     uint8_t full[16*9];\
1933     uint8_t halfH[72];\
1934     uint8_t halfV[64];\
1935     uint8_t halfHV[64];\
1936     copy_block9(full, src, 16, stride, 9);\
1937     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1938     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1939     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1940     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1941 }\
1942 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1943     uint8_t full[16*9];\
1944     uint8_t halfH[72];\
1945     uint8_t halfHV[64];\
1946     copy_block9(full, src, 16, stride, 9);\
1947     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1948     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1949     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1950     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1951 }\
1952 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1953     uint8_t full[16*9];\
1954     uint8_t halfH[72];\
1955     uint8_t halfV[64];\
1956     uint8_t halfHV[64];\
1957     copy_block9(full, src, 16, stride, 9);\
1958     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1959     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1960     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1961     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1962 }\
1963 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1964     uint8_t full[16*9];\
1965     uint8_t halfH[72];\
1966     uint8_t halfHV[64];\
1967     copy_block9(full, src, 16, stride, 9);\
1968     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1969     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1970     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1971     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1972 }\
1973 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1974     uint8_t halfH[72];\
1975     uint8_t halfHV[64];\
1976     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1977     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1978     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1979 }\
1980 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1981     uint8_t halfH[72];\
1982     uint8_t halfHV[64];\
1983     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1984     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1985     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1986 }\
1987 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1988     uint8_t full[16*9];\
1989     uint8_t halfH[72];\
1990     uint8_t halfV[64];\
1991     uint8_t halfHV[64];\
1992     copy_block9(full, src, 16, stride, 9);\
1993     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1994     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1995     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1996     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1997 }\
1998 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1999     uint8_t full[16*9];\
2000     uint8_t halfH[72];\
2001     copy_block9(full, src, 16, stride, 9);\
2002     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2003     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2004     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2005 }\
2006 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007     uint8_t full[16*9];\
2008     uint8_t halfH[72];\
2009     uint8_t halfV[64];\
2010     uint8_t halfHV[64];\
2011     copy_block9(full, src, 16, stride, 9);\
2012     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2013     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2014     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2015     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2016 }\
2017 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2018     uint8_t full[16*9];\
2019     uint8_t halfH[72];\
2020     copy_block9(full, src, 16, stride, 9);\
2021     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2022     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2023     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2024 }\
2025 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2026     uint8_t halfH[72];\
2027     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2028     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2029 }\
2030 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2031     OPNAME ## pixels16_c(dst, src, stride, 16);\
2032 }\
2033 \
2034 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2035     uint8_t half[256];\
2036     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2037     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2038 }\
2039 \
2040 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2041     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2042 }\
2043 \
2044 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2045     uint8_t half[256];\
2046     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2047     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2048 }\
2049 \
2050 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2051     uint8_t full[24*17];\
2052     uint8_t half[256];\
2053     copy_block17(full, src, 24, stride, 17);\
2054     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2055     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2056 }\
2057 \
2058 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2059     uint8_t full[24*17];\
2060     copy_block17(full, src, 24, stride, 17);\
2061     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2062 }\
2063 \
2064 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2065     uint8_t full[24*17];\
2066     uint8_t half[256];\
2067     copy_block17(full, src, 24, stride, 17);\
2068     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2069     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2070 }\
2071 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2072     uint8_t full[24*17];\
2073     uint8_t halfH[272];\
2074     uint8_t halfV[256];\
2075     uint8_t halfHV[256];\
2076     copy_block17(full, src, 24, stride, 17);\
2077     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2078     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2079     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2080     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2081 }\
2082 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2083     uint8_t full[24*17];\
2084     uint8_t halfH[272];\
2085     uint8_t halfHV[256];\
2086     copy_block17(full, src, 24, stride, 17);\
2087     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2088     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2089     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2090     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2091 }\
2092 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2093     uint8_t full[24*17];\
2094     uint8_t halfH[272];\
2095     uint8_t halfV[256];\
2096     uint8_t halfHV[256];\
2097     copy_block17(full, src, 24, stride, 17);\
2098     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2099     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2100     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2101     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2102 }\
2103 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2104     uint8_t full[24*17];\
2105     uint8_t halfH[272];\
2106     uint8_t halfHV[256];\
2107     copy_block17(full, src, 24, stride, 17);\
2108     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2109     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2110     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2111     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2112 }\
2113 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2114     uint8_t full[24*17];\
2115     uint8_t halfH[272];\
2116     uint8_t halfV[256];\
2117     uint8_t halfHV[256];\
2118     copy_block17(full, src, 24, stride, 17);\
2119     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2120     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2121     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2122     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2123 }\
2124 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2125     uint8_t full[24*17];\
2126     uint8_t halfH[272];\
2127     uint8_t halfHV[256];\
2128     copy_block17(full, src, 24, stride, 17);\
2129     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2130     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2131     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2132     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2133 }\
2134 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2135     uint8_t full[24*17];\
2136     uint8_t halfH[272];\
2137     uint8_t halfV[256];\
2138     uint8_t halfHV[256];\
2139     copy_block17(full, src, 24, stride, 17);\
2140     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2141     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2142     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2143     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2144 }\
2145 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2146     uint8_t full[24*17];\
2147     uint8_t halfH[272];\
2148     uint8_t halfHV[256];\
2149     copy_block17(full, src, 24, stride, 17);\
2150     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2151     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2152     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2153     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2154 }\
2155 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2156     uint8_t halfH[272];\
2157     uint8_t halfHV[256];\
2158     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2159     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2160     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2161 }\
2162 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2163     uint8_t halfH[272];\
2164     uint8_t halfHV[256];\
2165     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2166     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2167     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2168 }\
2169 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2170     uint8_t full[24*17];\
2171     uint8_t halfH[272];\
2172     uint8_t halfV[256];\
2173     uint8_t halfHV[256];\
2174     copy_block17(full, src, 24, stride, 17);\
2175     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2176     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2177     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2178     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2179 }\
2180 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2181     uint8_t full[24*17];\
2182     uint8_t halfH[272];\
2183     copy_block17(full, src, 24, stride, 17);\
2184     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2185     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2186     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2187 }\
2188 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2189     uint8_t full[24*17];\
2190     uint8_t halfH[272];\
2191     uint8_t halfV[256];\
2192     uint8_t halfHV[256];\
2193     copy_block17(full, src, 24, stride, 17);\
2194     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2195     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2196     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2197     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2198 }\
2199 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2200     uint8_t full[24*17];\
2201     uint8_t halfH[272];\
2202     copy_block17(full, src, 24, stride, 17);\
2203     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2204     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2205     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2206 }\
2207 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2208     uint8_t halfH[272];\
2209     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2210     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2211 }
2212
2213 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2214 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2215 #define op_put(a, b) a = cm[((b) + 16)>>5]
2216 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2217
2218 QPEL_MC(0, put_       , _       , op_put)
2219 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2220 QPEL_MC(0, avg_       , _       , op_avg)
2221 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2222 #undef op_avg
2223 #undef op_avg_no_rnd
2224 #undef op_put
2225 #undef op_put_no_rnd
2226
2227 #if 1
2228 #define H264_LOWPASS(OPNAME, OP, OP2) \
2229 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2230     const int h=2;\
2231     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2232     int i;\
2233     for(i=0; i<h; i++)\
2234     {\
2235         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2236         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2237         dst+=dstStride;\
2238         src+=srcStride;\
2239     }\
2240 }\
2241 \
2242 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2243     const int w=2;\
2244     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2245     int i;\
2246     for(i=0; i<w; i++)\
2247     {\
2248         const int srcB= src[-2*srcStride];\
2249         const int srcA= src[-1*srcStride];\
2250         const int src0= src[0 *srcStride];\
2251         const int src1= src[1 *srcStride];\
2252         const int src2= src[2 *srcStride];\
2253         const int src3= src[3 *srcStride];\
2254         const int src4= src[4 *srcStride];\
2255         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2256         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2257         dst++;\
2258         src++;\
2259     }\
2260 }\
2261 \
2262 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2263     const int h=2;\
2264     const int w=2;\
2265     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2266     int i;\
2267     src -= 2*srcStride;\
2268     for(i=0; i<h+5; i++)\
2269     {\
2270         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2271         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2272         tmp+=tmpStride;\
2273         src+=srcStride;\
2274     }\
2275     tmp -= tmpStride*(h+5-2);\
2276     for(i=0; i<w; i++)\
2277     {\
2278         const int tmpB= tmp[-2*tmpStride];\
2279         const int tmpA= tmp[-1*tmpStride];\
2280         const int tmp0= tmp[0 *tmpStride];\
2281         const int tmp1= tmp[1 *tmpStride];\
2282         const int tmp2= tmp[2 *tmpStride];\
2283         const int tmp3= tmp[3 *tmpStride];\
2284         const int tmp4= tmp[4 *tmpStride];\
2285         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2286         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2287         dst++;\
2288         tmp++;\
2289     }\
2290 }\
2291 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2292     const int h=4;\
2293     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2294     int i;\
2295     for(i=0; i<h; i++)\
2296     {\
2297         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2298         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2299         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2300         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2301         dst+=dstStride;\
2302         src+=srcStride;\
2303     }\
2304 }\
2305 \
2306 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2307     const int w=4;\
2308     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2309     int i;\
2310     for(i=0; i<w; i++)\
2311     {\
2312         const int srcB= src[-2*srcStride];\
2313         const int srcA= src[-1*srcStride];\
2314         const int src0= src[0 *srcStride];\
2315         const int src1= src[1 *srcStride];\
2316         const int src2= src[2 *srcStride];\
2317         const int src3= src[3 *srcStride];\
2318         const int src4= src[4 *srcStride];\
2319         const int src5= src[5 *srcStride];\
2320         const int src6= src[6 *srcStride];\
2321         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2322         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2323         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2324         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2325         dst++;\
2326         src++;\
2327     }\
2328 }\
2329 \
2330 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2331     const int h=4;\
2332     const int w=4;\
2333     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2334     int i;\
2335     src -= 2*srcStride;\
2336     for(i=0; i<h+5; i++)\
2337     {\
2338         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2339         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2340         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2341         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2342         tmp+=tmpStride;\
2343         src+=srcStride;\
2344     }\
2345     tmp -= tmpStride*(h+5-2);\
2346     for(i=0; i<w; i++)\
2347     {\
2348         const int tmpB= tmp[-2*tmpStride];\
2349         const int tmpA= tmp[-1*tmpStride];\
2350         const int tmp0= tmp[0 *tmpStride];\
2351         const int tmp1= tmp[1 *tmpStride];\
2352         const int tmp2= tmp[2 *tmpStride];\
2353         const int tmp3= tmp[3 *tmpStride];\
2354         const int tmp4= tmp[4 *tmpStride];\
2355         const int tmp5= tmp[5 *tmpStride];\
2356         const int tmp6= tmp[6 *tmpStride];\
2357         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2358         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2359         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2360         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2361         dst++;\
2362         tmp++;\
2363     }\
2364 }\
2365 \
2366 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2367     const int h=8;\
2368     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2369     int i;\
2370     for(i=0; i<h; i++)\
2371     {\
2372         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2373         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2374         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2375         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2376         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2377         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2378         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2379         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2380         dst+=dstStride;\
2381         src+=srcStride;\
2382     }\
2383 }\
2384 \
2385 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2386     const int w=8;\
2387     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2388     int i;\
2389     for(i=0; i<w; i++)\
2390     {\
2391         const int srcB= src[-2*srcStride];\
2392         const int srcA= src[-1*srcStride];\
2393         const int src0= src[0 *srcStride];\
2394         const int src1= src[1 *srcStride];\
2395         const int src2= src[2 *srcStride];\
2396         const int src3= src[3 *srcStride];\
2397         const int src4= src[4 *srcStride];\
2398         const int src5= src[5 *srcStride];\
2399         const int src6= src[6 *srcStride];\
2400         const int src7= src[7 *srcStride];\
2401         const int src8= src[8 *srcStride];\
2402         const int src9= src[9 *srcStride];\
2403         const int src10=src[10*srcStride];\
2404         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2405         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2406         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2407         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2408         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2409         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2410         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2411         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2412         dst++;\
2413         src++;\
2414     }\
2415 }\
2416 \
2417 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2418     const int h=8;\
2419     const int w=8;\
2420     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2421     int i;\
2422     src -= 2*srcStride;\
2423     for(i=0; i<h+5; i++)\
2424     {\
2425         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2426         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2427         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2428         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2429         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2430         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2431         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2432         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2433         tmp+=tmpStride;\
2434         src+=srcStride;\
2435     }\
2436     tmp -= tmpStride*(h+5-2);\
2437     for(i=0; i<w; i++)\
2438     {\
2439         const int tmpB= tmp[-2*tmpStride];\
2440         const int tmpA= tmp[-1*tmpStride];\
2441         const int tmp0= tmp[0 *tmpStride];\
2442         const int tmp1= tmp[1 *tmpStride];\
2443         const int tmp2= tmp[2 *tmpStride];\
2444         const int tmp3= tmp[3 *tmpStride];\
2445         const int tmp4= tmp[4 *tmpStride];\
2446         const int tmp5= tmp[5 *tmpStride];\
2447         const int tmp6= tmp[6 *tmpStride];\
2448         const int tmp7= tmp[7 *tmpStride];\
2449         const int tmp8= tmp[8 *tmpStride];\
2450         const int tmp9= tmp[9 *tmpStride];\
2451         const int tmp10=tmp[10*tmpStride];\
2452         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2453         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2454         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2455         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2456         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2457         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2458         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2459         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2460         dst++;\
2461         tmp++;\
2462     }\
2463 }\
2464 \
2465 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2466     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2467     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2468     src += 8*srcStride;\
2469     dst += 8*dstStride;\
2470     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2471     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2472 }\
2473 \
2474 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2475     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2476     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2477     src += 8*srcStride;\
2478     dst += 8*dstStride;\
2479     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2480     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2481 }\
2482 \
2483 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2484     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2485     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2486     src += 8*srcStride;\
2487     dst += 8*dstStride;\
2488     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2489     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2490 }\
2491
2492 #define H264_MC(OPNAME, SIZE) \
2493 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2494     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2495 }\
2496 \
2497 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2498     uint8_t half[SIZE*SIZE];\
2499     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2500     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2501 }\
2502 \
2503 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2504     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2505 }\
2506 \
2507 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2508     uint8_t half[SIZE*SIZE];\
2509     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2510     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2511 }\
2512 \
2513 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2514     uint8_t full[SIZE*(SIZE+5)];\
2515     uint8_t * const full_mid= full + SIZE*2;\
2516     uint8_t half[SIZE*SIZE];\
2517     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2518     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2519     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2520 }\
2521 \
2522 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2523     uint8_t full[SIZE*(SIZE+5)];\
2524     uint8_t * const full_mid= full + SIZE*2;\
2525     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2526     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2527 }\
2528 \
2529 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2530     uint8_t full[SIZE*(SIZE+5)];\
2531     uint8_t * const full_mid= full + SIZE*2;\
2532     uint8_t half[SIZE*SIZE];\
2533     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2534     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2535     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2536 }\
2537 \
2538 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2539     uint8_t full[SIZE*(SIZE+5)];\
2540     uint8_t * const full_mid= full + SIZE*2;\
2541     uint8_t halfH[SIZE*SIZE];\
2542     uint8_t halfV[SIZE*SIZE];\
2543     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2544     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2545     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2546     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2547 }\
2548 \
2549 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2550     uint8_t full[SIZE*(SIZE+5)];\
2551     uint8_t * const full_mid= full + SIZE*2;\
2552     uint8_t halfH[SIZE*SIZE];\
2553     uint8_t halfV[SIZE*SIZE];\
2554     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2555     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2556     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2557     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2558 }\
2559 \
2560 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2561     uint8_t full[SIZE*(SIZE+5)];\
2562     uint8_t * const full_mid= full + SIZE*2;\
2563     uint8_t halfH[SIZE*SIZE];\
2564     uint8_t halfV[SIZE*SIZE];\
2565     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2566     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2567     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2568     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2569 }\
2570 \
2571 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2572     uint8_t full[SIZE*(SIZE+5)];\
2573     uint8_t * const full_mid= full + SIZE*2;\
2574     uint8_t halfH[SIZE*SIZE];\
2575     uint8_t halfV[SIZE*SIZE];\
2576     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2577     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2578     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2579     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2580 }\
2581 \
2582 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2583     int16_t tmp[SIZE*(SIZE+5)];\
2584     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2585 }\
2586 \
2587 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2588     int16_t tmp[SIZE*(SIZE+5)];\
2589     uint8_t halfH[SIZE*SIZE];\
2590     uint8_t halfHV[SIZE*SIZE];\
2591     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2592     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2593     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2594 }\
2595 \
2596 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2597     int16_t tmp[SIZE*(SIZE+5)];\
2598     uint8_t halfH[SIZE*SIZE];\
2599     uint8_t halfHV[SIZE*SIZE];\
2600     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2601     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2602     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2603 }\
2604 \
2605 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2606     uint8_t full[SIZE*(SIZE+5)];\
2607     uint8_t * const full_mid= full + SIZE*2;\
2608     int16_t tmp[SIZE*(SIZE+5)];\
2609     uint8_t halfV[SIZE*SIZE];\
2610     uint8_t halfHV[SIZE*SIZE];\
2611     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2612     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2613     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2614     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2615 }\
2616 \
2617 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2618     uint8_t full[SIZE*(SIZE+5)];\
2619     uint8_t * const full_mid= full + SIZE*2;\
2620     int16_t tmp[SIZE*(SIZE+5)];\
2621     uint8_t halfV[SIZE*SIZE];\
2622     uint8_t halfHV[SIZE*SIZE];\
2623     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2624     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2625     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2626     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2627 }\
2628
2629 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2630 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2631 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2632 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2633 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2634
2635 H264_LOWPASS(put_       , op_put, op2_put)
2636 H264_LOWPASS(avg_       , op_avg, op2_avg)
2637 H264_MC(put_, 2)
2638 H264_MC(put_, 4)
2639 H264_MC(put_, 8)
2640 H264_MC(put_, 16)
2641 H264_MC(avg_, 4)
2642 H264_MC(avg_, 8)
2643 H264_MC(avg_, 16)
2644
2645 #undef op_avg
2646 #undef op_put
2647 #undef op2_avg
2648 #undef op2_put
2649 #endif
2650
2651 #define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2652 #define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2653 #define H264_WEIGHT(W,H) \
2654 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2655     int y; \
2656     offset <<= log2_denom; \
2657     if(log2_denom) offset += 1<<(log2_denom-1); \
2658     for(y=0; y<H; y++, block += stride){ \
2659         op_scale1(0); \
2660         op_scale1(1); \
2661         if(W==2) continue; \
2662         op_scale1(2); \
2663         op_scale1(3); \
2664         if(W==4) continue; \
2665         op_scale1(4); \
2666         op_scale1(5); \
2667         op_scale1(6); \
2668         op_scale1(7); \
2669         if(W==8) continue; \
2670         op_scale1(8); \
2671         op_scale1(9); \
2672         op_scale1(10); \
2673         op_scale1(11); \
2674         op_scale1(12); \
2675         op_scale1(13); \
2676         op_scale1(14); \
2677         op_scale1(15); \
2678     } \
2679 } \
2680 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2681     int y; \
2682     offset = ((offset + 1) | 1) << log2_denom; \
2683     for(y=0; y<H; y++, dst += stride, src += stride){ \
2684         op_scale2(0); \
2685         op_scale2(1); \
2686         if(W==2) continue; \
2687         op_scale2(2); \
2688         op_scale2(3); \
2689         if(W==4) continue; \
2690         op_scale2(4); \
2691         op_scale2(5); \
2692         op_scale2(6); \
2693         op_scale2(7); \
2694         if(W==8) continue; \
2695         op_scale2(8); \
2696         op_scale2(9); \
2697         op_scale2(10); \
2698         op_scale2(11); \
2699         op_scale2(12); \
2700         op_scale2(13); \
2701         op_scale2(14); \
2702         op_scale2(15); \
2703     } \
2704 }
2705
2706 H264_WEIGHT(16,16)
2707 H264_WEIGHT(16,8)
2708 H264_WEIGHT(8,16)
2709 H264_WEIGHT(8,8)
2710 H264_WEIGHT(8,4)
2711 H264_WEIGHT(4,8)
2712 H264_WEIGHT(4,4)
2713 H264_WEIGHT(4,2)
2714 H264_WEIGHT(2,4)
2715 H264_WEIGHT(2,2)
2716
2717 #undef op_scale1
2718 #undef op_scale2
2719 #undef H264_WEIGHT
2720
2721 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2722     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2723     int i;
2724
2725     for(i=0; i<h; i++){
2726         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2727         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2728         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2729         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2730         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2731         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2732         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2733         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2734         dst+=dstStride;
2735         src+=srcStride;
2736     }
2737 }
2738
2739 #if CONFIG_CAVS_DECODER
2740 /* AVS specific */
2741 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2742
2743 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2744     put_pixels8_c(dst, src, stride, 8);
2745 }
2746 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2747     avg_pixels8_c(dst, src, stride, 8);
2748 }
2749 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2750     put_pixels16_c(dst, src, stride, 16);
2751 }
2752 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2753     avg_pixels16_c(dst, src, stride, 16);
2754 }
2755 #endif /* CONFIG_CAVS_DECODER */
2756
2757 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
2758
2759 #if CONFIG_VC1_DECODER
2760 /* VC-1 specific */
2761 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2762
2763 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2764     put_pixels8_c(dst, src, stride, 8);
2765 }
2766 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2767     avg_pixels8_c(dst, src, stride, 8);
2768 }
2769 #endif /* CONFIG_VC1_DECODER */
2770
2771 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2772
2773 /* H264 specific */
2774 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2775
2776 #if CONFIG_RV30_DECODER
2777 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2778 #endif /* CONFIG_RV30_DECODER */
2779
2780 #if CONFIG_RV40_DECODER
2781 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2782     put_pixels16_xy2_c(dst, src, stride, 16);
2783 }
2784 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2785     avg_pixels16_xy2_c(dst, src, stride, 16);
2786 }
2787 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2788     put_pixels8_xy2_c(dst, src, stride, 8);
2789 }
2790 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2791     avg_pixels8_xy2_c(dst, src, stride, 8);
2792 }
2793
2794 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2795 #endif /* CONFIG_RV40_DECODER */
2796
2797 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2798     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2799     int i;
2800
2801     for(i=0; i<w; i++){
2802         const int src_1= src[ -srcStride];
2803         const int src0 = src[0          ];
2804         const int src1 = src[  srcStride];
2805         const int src2 = src[2*srcStride];
2806         const int src3 = src[3*srcStride];
2807         const int src4 = src[4*srcStride];
2808         const int src5 = src[5*srcStride];
2809         const int src6 = src[6*srcStride];
2810         const int src7 = src[7*srcStride];
2811         const int src8 = src[8*srcStride];
2812         const int src9 = src[9*srcStride];
2813         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2814         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2815         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2816         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2817         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2818         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2819         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2820         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2821         src++;
2822         dst++;
2823     }
2824 }
2825
2826 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2827     put_pixels8_c(dst, src, stride, 8);
2828 }
2829
2830 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2831     uint8_t half[64];
2832     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2833     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2834 }
2835
2836 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2837     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2838 }
2839
2840 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2841     uint8_t half[64];
2842     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2843     put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2844 }
2845
2846 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2847     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2848 }
2849
2850 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2851     uint8_t halfH[88];
2852     uint8_t halfV[64];
2853     uint8_t halfHV[64];
2854     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2855     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2856     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2857     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2858 }
2859 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2860     uint8_t halfH[88];
2861     uint8_t halfV[64];
2862     uint8_t halfHV[64];
2863     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2864     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2865     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2866     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2867 }
2868 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2869     uint8_t halfH[88];
2870     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2871     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2872 }
2873
2874 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2875     if(CONFIG_ANY_H263) {
2876     int x;
2877     const int strength= ff_h263_loop_filter_strength[qscale];
2878
2879     for(x=0; x<8; x++){
2880         int d1, d2, ad1;
2881         int p0= src[x-2*stride];
2882         int p1= src[x-1*stride];
2883         int p2= src[x+0*stride];
2884         int p3= src[x+1*stride];
2885         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2886
2887         if     (d<-2*strength) d1= 0;
2888         else if(d<-  strength) d1=-2*strength - d;
2889         else if(d<   strength) d1= d;
2890         else if(d< 2*strength) d1= 2*strength - d;
2891         else                   d1= 0;
2892
2893         p1 += d1;
2894         p2 -= d1;
2895         if(p1&256) p1= ~(p1>>31);
2896         if(p2&256) p2= ~(p2>>31);
2897
2898         src[x-1*stride] = p1;
2899         src[x+0*stride] = p2;
2900
2901         ad1= FFABS(d1)>>1;
2902
2903         d2= av_clip((p0-p3)/4, -ad1, ad1);
2904
2905         src[x-2*stride] = p0 - d2;
2906         src[x+  stride] = p3 + d2;
2907     }
2908     }
2909 }
2910
2911 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2912     if(CONFIG_ANY_H263) {
2913     int y;
2914     const int strength= ff_h263_loop_filter_strength[qscale];
2915
2916     for(y=0; y<8; y++){
2917         int d1, d2, ad1;
2918         int p0= src[y*stride-2];
2919         int p1= src[y*stride-1];
2920         int p2= src[y*stride+0];
2921         int p3= src[y*stride+1];
2922         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2923
2924         if     (d<-2*strength) d1= 0;
2925         else if(d<-  strength) d1=-2*strength - d;
2926         else if(d<   strength) d1= d;
2927         else if(d< 2*strength) d1= 2*strength - d;
2928         else                   d1= 0;
2929
2930         p1 += d1;
2931         p2 -= d1;
2932         if(p1&256) p1= ~(p1>>31);
2933         if(p2&256) p2= ~(p2>>31);
2934
2935         src[y*stride-1] = p1;
2936         src[y*stride+0] = p2;
2937
2938         ad1= FFABS(d1)>>1;
2939
2940         d2= av_clip((p0-p3)/4, -ad1, ad1);
2941
2942         src[y*stride-2] = p0 - d2;
2943         src[y*stride+1] = p3 + d2;
2944     }
2945     }
2946 }
2947
2948 static void h261_loop_filter_c(uint8_t *src, int stride){
2949     int x,y,xy,yz;
2950     int temp[64];
2951
2952     for(x=0; x<8; x++){
2953         temp[x      ] = 4*src[x           ];
2954         temp[x + 7*8] = 4*src[x + 7*stride];
2955     }
2956     for(y=1; y<7; y++){
2957         for(x=0; x<8; x++){
2958             xy = y * stride + x;
2959             yz = y * 8 + x;
2960             temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2961         }
2962     }
2963
2964     for(y=0; y<8; y++){
2965         src[  y*stride] = (temp[  y*8] + 2)>>2;
2966         src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2967         for(x=1; x<7; x++){
2968             xy = y * stride + x;
2969             yz = y * 8 + x;
2970             src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2971         }
2972     }
2973 }
2974
2975 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2976 {
2977     int i, d;
2978     for( i = 0; i < 4; i++ ) {
2979         if( tc0[i] < 0 ) {
2980             pix += 4*ystride;
2981             continue;
2982         }
2983         for( d = 0; d < 4; d++ ) {
2984             const int p0 = pix[-1*xstride];
2985             const int p1 = pix[-2*xstride];
2986             const int p2 = pix[-3*xstride];
2987             const int q0 = pix[0];
2988             const int q1 = pix[1*xstride];
2989             const int q2 = pix[2*xstride];
2990
2991             if( FFABS( p0 - q0 ) < alpha &&
2992                 FFABS( p1 - p0 ) < beta &&
2993                 FFABS( q1 - q0 ) < beta ) {
2994
2995                 int tc = tc0[i];
2996                 int i_delta;
2997
2998                 if( FFABS( p2 - p0 ) < beta ) {
2999                     pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3000                     tc++;
3001                 }
3002                 if( FFABS( q2 - q0 ) < beta ) {
3003                     pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
3004                     tc++;
3005                 }
3006
3007                 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3008                 pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
3009                 pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
3010             }
3011             pix += ystride;
3012         }
3013     }
3014 }
3015 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3016 {
3017     h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
3018 }
3019 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3020 {
3021     h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
3022 }
3023
3024 static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3025 {
3026     int d;
3027     for( d = 0; d < 16; d++ ) {
3028         const int p2 = pix[-3*xstride];
3029         const int p1 = pix[-2*xstride];
3030         const int p0 = pix[-1*xstride];
3031
3032         const int q0 = pix[ 0*xstride];
3033         const int q1 = pix[ 1*xstride];
3034         const int q2 = pix[ 2*xstride];
3035
3036         if( FFABS( p0 - q0 ) < alpha &&
3037             FFABS( p1 - p0 ) < beta &&
3038             FFABS( q1 - q0 ) < beta ) {
3039
3040             if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3041                 if( FFABS( p2 - p0 ) < beta)
3042                 {
3043                     const int p3 = pix[-4*xstride];
3044                     /* p0', p1', p2' */
3045                     pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3046                     pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3047                     pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3048                 } else {
3049                     /* p0' */
3050                     pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3051                 }
3052                 if( FFABS( q2 - q0 ) < beta)
3053                 {
3054                     const int q3 = pix[3*xstride];
3055                     /* q0', q1', q2' */
3056                     pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3057                     pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3058                     pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3059                 } else {
3060                     /* q0' */
3061                     pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3062                 }
3063             }else{
3064                 /* p0', q0' */
3065                 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3066                 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3067             }
3068         }
3069         pix += ystride;
3070     }
3071 }
3072 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3073 {
3074     h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3075 }
3076 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3077 {
3078     h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
3079 }
3080
3081 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3082 {
3083     int i, d;
3084     for( i = 0; i < 4; i++ ) {
3085         const int tc = tc0[i];
3086         if( tc <= 0 ) {
3087             pix += 2*ystride;
3088             continue;
3089         }
3090         for( d = 0; d < 2; d++ ) {
3091             const int p0 = pix[-1*xstride];
3092             const int p1 = pix[-2*xstride];
3093             const int q0 = pix[0];
3094             const int q1 = pix[1*xstride];
3095
3096             if( FFABS( p0 - q0 ) < alpha &&
3097                 FFABS( p1 - p0 ) < beta &&
3098                 FFABS( q1 - q0 ) < beta ) {
3099
3100                 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3101
3102                 pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
3103                 pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
3104             }
3105             pix += ystride;
3106         }
3107     }
3108 }
3109 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3110 {
3111     h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3112 }
3113 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3114 {
3115     h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3116 }
3117
3118 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3119 {
3120     int d;
3121     for( d = 0; d < 8; d++ ) {
3122         const int p0 = pix[-1*xstride];
3123         const int p1 = pix[-2*xstride];
3124         const int q0 = pix[0];
3125         const int q1 = pix[1*xstride];
3126
3127         if( FFABS( p0 - q0 ) < alpha &&
3128             FFABS( p1 - p0 ) < beta &&
3129             FFABS( q1 - q0 ) < beta ) {
3130
3131             pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
3132             pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
3133         }
3134         pix += ystride;
3135     }
3136 }
3137 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3138 {
3139     h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3140 }
3141 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3142 {
3143     h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3144 }
3145
3146 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3147 {
3148     int s, i;
3149
3150     s = 0;
3151     for(i=0;i<h;i++) {
3152         s += abs(pix1[0] - pix2[0]);
3153         s += abs(pix1[1] - pix2[1]);
3154         s += abs(pix1[2] - pix2[2]);
3155         s += abs(pix1[3] - pix2[3]);
3156         s += abs(pix1[4] - pix2[4]);
3157         s += abs(pix1[5] - pix2[5]);
3158         s += abs(pix1[6] - pix2[6]);
3159         s += abs(pix1[7] - pix2[7]);
3160         s += abs(pix1[8] - pix2[8]);
3161         s += abs(pix1[9] - pix2[9]);
3162         s += abs(pix1[10] - pix2[10]);
3163         s += abs(pix1[11] - pix2[11]);
3164         s += abs(pix1[12] - pix2[12]);
3165         s += abs(pix1[13] - pix2[13]);
3166         s += abs(pix1[14] - pix2[14]);
3167         s += abs(pix1[15] - pix2[15]);
3168         pix1 += line_size;
3169         pix2 += line_size;
3170     }
3171     return s;
3172 }
3173
3174 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3175 {
3176     int s, i;
3177
3178     s = 0;
3179     for(i=0;i<h;i++) {
3180         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3181         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3182         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3183         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3184         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3185         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3186         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3187         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3188         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3189         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3190         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3191         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3192         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3193         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3194         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3195         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3196         pix1 += line_size;
3197         pix2 += line_size;
3198     }
3199     return s;
3200 }
3201
3202 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3203 {
3204     int s, i;
3205     uint8_t *pix3 = pix2 + line_size;
3206
3207     s = 0;
3208     for(i=0;i<h;i++) {
3209         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3210         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3211         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3212         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3213         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3214         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3215         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3216         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3217         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3218         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3219         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3220         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3221         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3222         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3223         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3224         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3225         pix1 += line_size;
3226         pix2 += line_size;
3227         pix3 += line_size;
3228     }
3229     return s;
3230 }
3231
3232 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3233 {
3234     int s, i;
3235     uint8_t *pix3 = pix2 + line_size;
3236
3237     s = 0;
3238     for(i=0;i<h;i++) {
3239         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3240         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3241         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3242         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3243         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3244         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3245         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3246         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3247         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3248         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3249         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3250         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3251         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3252         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3253         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3254         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3255         pix1 += line_size;
3256         pix2 += line_size;
3257         pix3 += line_size;
3258     }
3259     return s;
3260 }
3261
3262 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3263 {
3264     int s, i;
3265
3266     s = 0;
3267     for(i=0;i<h;i++) {
3268         s += abs(pix1[0] - pix2[0]);
3269         s += abs(pix1[1] - pix2[1]);
3270         s += abs(pix1[2] - pix2[2]);
3271         s += abs(pix1[3] - pix2[3]);
3272         s += abs(pix1[4] - pix2[4]);
3273         s += abs(pix1[5] - pix2[5]);
3274         s += abs(pix1[6] - pix2[6]);
3275         s += abs(pix1[7] - pix2[7]);
3276         pix1 += line_size;
3277         pix2 += line_size;
3278     }
3279     return s;
3280 }
3281
3282 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3283 {
3284     int s, i;
3285
3286     s = 0;
3287     for(i=0;i<h;i++) {
3288         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3289         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3290         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3291         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3292         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3293         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3294         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3295         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3296         pix1 += line_size;
3297         pix2 += line_size;
3298     }
3299     return s;
3300 }
3301
3302 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3303 {
3304     int s, i;
3305     uint8_t *pix3 = pix2 + line_size;
3306
3307     s = 0;
3308     for(i=0;i<h;i++) {
3309         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3310         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3311         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3312         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3313         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3314         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3315         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3316         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3317         pix1 += line_size;
3318         pix2 += line_size;
3319         pix3 += line_size;
3320     }
3321     return s;
3322 }
3323
3324 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3325 {
3326     int s, i;
3327     uint8_t *pix3 = pix2 + line_size;
3328
3329     s = 0;
3330     for(i=0;i<h;i++) {
3331         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3332         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3333         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3334         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3335         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3336         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3337         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3338         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3339         pix1 += line_size;
3340         pix2 += line_size;
3341         pix3 += line_size;
3342     }
3343     return s;
3344 }
3345
3346 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3347     MpegEncContext *c = v;
3348     int score1=0;
3349     int score2=0;
3350     int x,y;
3351
3352     for(y=0; y<h; y++){
3353         for(x=0; x<16; x++){
3354             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3355         }
3356         if(y+1<h){
3357             for(x=0; x<15; x++){
3358                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3359                              - s1[x+1] + s1[x+1+stride])
3360                         -FFABS(  s2[x  ] - s2[x  +stride]
3361                              - s2[x+1] + s2[x+1+stride]);
3362             }
3363         }
3364         s1+= stride;
3365         s2+= stride;
3366     }
3367
3368     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3369     else  return score1 + FFABS(score2)*8;
3370 }
3371
3372 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3373     MpegEncContext *c = v;
3374     int score1=0;
3375     int score2=0;
3376     int x,y;
3377
3378     for(y=0; y<h; y++){
3379         for(x=0; x<8; x++){
3380             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3381         }
3382         if(y+1<h){
3383             for(x=0; x<7; x++){
3384                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3385                              - s1[x+1] + s1[x+1+stride])
3386                         -FFABS(  s2[x  ] - s2[x  +stride]
3387                              - s2[x+1] + s2[x+1+stride]);
3388             }
3389         }
3390         s1+= stride;
3391         s2+= stride;
3392     }
3393
3394     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3395     else  return score1 + FFABS(score2)*8;
3396 }
3397
3398 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3399     int i;
3400     unsigned int sum=0;
3401
3402     for(i=0; i<8*8; i++){
3403         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3404         int w= weight[i];
3405         b>>= RECON_SHIFT;
3406         assert(-512<b && b<512);
3407
3408         sum += (w*b)*(w*b)>>4;
3409     }
3410     return sum>>2;
3411 }
3412
3413 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3414     int i;
3415
3416     for(i=0; i<8*8; i++){
3417         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3418     }
3419 }
3420
3421 /**
3422  * permutes an 8x8 block.
3423  * @param block the block which will be permuted according to the given permutation vector
3424  * @param permutation the permutation vector
3425  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3426  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3427  *                  (inverse) permutated to scantable order!
3428  */
3429 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3430 {
3431     int i;
3432     DCTELEM temp[64];
3433
3434     if(last<=0) return;
3435     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3436
3437     for(i=0; i<=last; i++){
3438         const int j= scantable[i];
3439         temp[j]= block[j];
3440         block[j]=0;
3441     }
3442
3443     for(i=0; i<=last; i++){
3444         const int j= scantable[i];
3445         const int perm_j= permutation[j];
3446         block[perm_j]= temp[j];
3447     }
3448 }
3449
3450 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3451     return 0;
3452 }
3453
3454 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3455     int i;
3456
3457     memset(cmp, 0, sizeof(void*)*6);
3458
3459     for(i=0; i<6; i++){
3460         switch(type&0xFF){
3461         case FF_CMP_SAD:
3462             cmp[i]= c->sad[i];
3463             break;
3464         case FF_CMP_SATD:
3465             cmp[i]= c->hadamard8_diff[i];
3466             break;
3467         case FF_CMP_SSE:
3468             cmp[i]= c->sse[i];
3469             break;
3470         case FF_CMP_DCT:
3471             cmp[i]= c->dct_sad[i];
3472             break;
3473         case FF_CMP_DCT264:
3474             cmp[i]= c->dct264_sad[i];
3475             break;
3476         case FF_CMP_DCTMAX:
3477             cmp[i]= c->dct_max[i];
3478             break;
3479         case FF_CMP_PSNR:
3480             cmp[i]= c->quant_psnr[i];
3481             break;
3482         case FF_CMP_BIT:
3483             cmp[i]= c->bit[i];
3484             break;
3485         case FF_CMP_RD:
3486             cmp[i]= c->rd[i];
3487             break;
3488         case FF_CMP_VSAD:
3489             cmp[i]= c->vsad[i];
3490             break;
3491         case FF_CMP_VSSE:
3492             cmp[i]= c->vsse[i];
3493             break;
3494         case FF_CMP_ZERO:
3495             cmp[i]= zero_cmp;
3496             break;
3497         case FF_CMP_NSSE:
3498             cmp[i]= c->nsse[i];
3499             break;
3500 #if CONFIG_SNOW_ENCODER
3501         case FF_CMP_W53:
3502             cmp[i]= c->w53[i];
3503             break;
3504         case FF_CMP_W97:
3505             cmp[i]= c->w97[i];
3506             break;
3507 #endif
3508         default:
3509             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3510         }
3511     }
3512 }
3513
3514 static void clear_block_c(DCTELEM *block)
3515 {
3516     memset(block, 0, sizeof(DCTELEM)*64);
3517 }
3518
3519 /**
3520  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3521  */
3522 static void clear_blocks_c(DCTELEM *blocks)
3523 {
3524     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3525 }
3526
3527 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3528     long i;
3529     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3530         long a = *(long*)(src+i);
3531         long b = *(long*)(dst+i);
3532         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3533     }
3534     for(; i<w; i++)
3535         dst[i+0] += src[i+0];
3536 }
3537
3538 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3539     long i;
3540     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3541         long a = *(long*)(src1+i);
3542         long b = *(long*)(src2+i);
3543         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3544     }
3545     for(; i<w; i++)
3546         dst[i] = src1[i]+src2[i];
3547 }
3548
3549 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3550     long i;
3551 #if !HAVE_FAST_UNALIGNED
3552     if((long)src2 & (sizeof(long)-1)){
3553         for(i=0; i+7<w; i+=8){
3554             dst[i+0] = src1[i+0]-src2[i+0];
3555             dst[i+1] = src1[i+1]-src2[i+1];
3556             dst[i+2] = src1[i+2]-src2[i+2];
3557             dst[i+3] = src1[i+3]-src2[i+3];
3558             dst[i+4] = src1[i+4]-src2[i+4];
3559             dst[i+5] = src1[i+5]-src2[i+5];
3560             dst[i+6] = src1[i+6]-src2[i+6];
3561             dst[i+7] = src1[i+7]-src2[i+7];
3562         }
3563     }else
3564 #endif
3565     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3566         long a = *(long*)(src1+i);
3567         long b = *(long*)(src2+i);
3568         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3569     }
3570     for(; i<w; i++)
3571         dst[i+0] = src1[i+0]-src2[i+0];
3572 }
3573
3574 static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
3575     int i;
3576     uint8_t l, lt;
3577
3578     l= *left;
3579     lt= *left_top;
3580
3581     for(i=0; i<w; i++){
3582         l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3583         lt= src1[i];
3584         dst[i]= l;
3585     }
3586
3587     *left= l;
3588     *left_top= lt;
3589 }
3590
3591 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3592     int i;
3593     uint8_t l, lt;
3594
3595     l= *left;
3596     lt= *left_top;
3597
3598     for(i=0; i<w; i++){
3599         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3600         lt= src1[i];
3601         l= src2[i];
3602         dst[i]= l - pred;
3603     }
3604
3605     *left= l;
3606     *left_top= lt;
3607 }
3608
3609 #define BUTTERFLY2(o1,o2,i1,i2) \
3610 o1= (i1)+(i2);\
3611 o2= (i1)-(i2);
3612
3613 #define BUTTERFLY1(x,y) \
3614 {\
3615     int a,b;\
3616     a= x;\
3617     b= y;\
3618     x= a+b;\
3619     y= a-b;\
3620 }
3621
3622 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3623
3624 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3625     int i;
3626     int temp[64];
3627     int sum=0;
3628
3629     assert(h==8);
3630
3631     for(i=0; i<8; i++){
3632         //FIXME try pointer walks
3633         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3634         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3635         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3636         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3637
3638         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3639         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3640         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3641         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3642
3643         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3644         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3645         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3646         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3647     }
3648
3649     for(i=0; i<8; i++){
3650         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3651         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3652         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3653         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3654
3655         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3656         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3657         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3658         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3659
3660         sum +=
3661              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3662             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3663             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3664             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3665     }
3666 #if 0
3667 static int maxi=0;
3668 if(sum>maxi){
3669     maxi=sum;
3670     printf("MAX:%d\n", maxi);
3671 }
3672 #endif
3673     return sum;
3674 }
3675
3676 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3677     int i;
3678     int temp[64];
3679     int sum=0;
3680
3681     assert(h==8);
3682
3683     for(i=0; i<8; i++){
3684         //FIXME try pointer walks
3685         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3686         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3687         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3688         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3689
3690         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3691         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3692         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3693         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3694
3695         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3696         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3697         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3698         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3699     }
3700
3701     for(i=0; i<8; i++){
3702         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3703         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3704         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3705         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3706
3707         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3708         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3709         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3710         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3711
3712         sum +=
3713              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3714             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3715             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3716             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3717     }
3718
3719     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3720
3721     return sum;
3722 }
3723
3724 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3725     MpegEncContext * const s= (MpegEncContext *)c;
3726     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3727     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3728
3729     assert(h==8);
3730
3731     s->dsp.diff_pixels(temp, src1, src2, stride);
3732     s->dsp.fdct(temp);
3733     return s->dsp.sum_abs_dctelem(temp);
3734 }
3735
3736 #if CONFIG_GPL
3737 #define DCT8_1D {\
3738     const int s07 = SRC(0) + SRC(7);\
3739     const int s16 = SRC(1) + SRC(6);\
3740     const int s25 = SRC(2) + SRC(5);\
3741     const int s34 = SRC(3) + SRC(4);\
3742     const int a0 = s07 + s34;\
3743     const int a1 = s16 + s25;\
3744     const int a2 = s07 - s34;\
3745     const int a3 = s16 - s25;\
3746     const int d07 = SRC(0) - SRC(7);\
3747     const int d16 = SRC(1) - SRC(6);\
3748     const int d25 = SRC(2) - SRC(5);\
3749     const int d34 = SRC(3) - SRC(4);\
3750     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3751     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3752     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3753     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3754     DST(0,  a0 + a1     ) ;\
3755     DST(1,  a4 + (a7>>2)) ;\
3756     DST(2,  a2 + (a3>>1)) ;\
3757     DST(3,  a5 + (a6>>2)) ;\
3758     DST(4,  a0 - a1     ) ;\
3759     DST(5,  a6 - (a5>>2)) ;\
3760     DST(6, (a2>>1) - a3 ) ;\
3761     DST(7, (a4>>2) - a7 ) ;\
3762 }
3763
3764 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3765     MpegEncContext * const s= (MpegEncContext *)c;
3766     DCTELEM dct[8][8];
3767     int i;
3768     int sum=0;
3769
3770     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3771
3772 #define SRC(x) dct[i][x]
3773 #define DST(x,v) dct[i][x]= v
3774     for( i = 0; i < 8; i++ )
3775         DCT8_1D
3776 #undef SRC
3777 #undef DST
3778
3779 #define SRC(x) dct[x][i]
3780 #define DST(x,v) sum += FFABS(v)
3781     for( i = 0; i < 8; i++ )
3782         DCT8_1D
3783 #undef SRC
3784 #undef DST
3785     return sum;
3786 }
3787 #endif
3788
3789 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3790     MpegEncContext * const s= (MpegEncContext *)c;
3791     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3792     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3793     int sum=0, i;
3794
3795     assert(h==8);
3796
3797     s->dsp.diff_pixels(temp, src1, src2, stride);
3798     s->dsp.fdct(temp);
3799
3800     for(i=0; i<64; i++)
3801         sum= FFMAX(sum, FFABS(temp[i]));
3802
3803     return sum;
3804 }
3805
3806 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3807     MpegEncContext * const s= (MpegEncContext *)c;
3808     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3809     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3810     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3811     int sum=0, i;
3812
3813     assert(h==8);
3814     s->mb_intra=0;
3815
3816     s->dsp.diff_pixels(temp, src1, src2, stride);
3817
3818     memcpy(bak, temp, 64*sizeof(DCTELEM));
3819
3820     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3821     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3822     ff_simple_idct(temp); //FIXME
3823
3824     for(i=0; i<64; i++)
3825         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3826
3827     return sum;
3828 }
3829
3830 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3831     MpegEncContext * const s= (MpegEncContext *)c;
3832     const uint8_t *scantable= s->intra_scantable.permutated;
3833     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3834     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3835     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3836     uint8_t * const bak= (uint8_t*)aligned_bak;
3837     int i, last, run, bits, level, distortion, start_i;
3838     const int esc_length= s->ac_esc_length;
3839     uint8_t * length;
3840     uint8_t * last_length;
3841
3842     assert(h==8);
3843
3844     for(i=0; i<8; i++){
3845         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3846         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3847     }
3848
3849     s->dsp.diff_pixels(temp, src1, src2, stride);
3850
3851     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3852
3853     bits=0;
3854
3855     if (s->mb_intra) {
3856         start_i = 1;
3857         length     = s->intra_ac_vlc_length;
3858         last_length= s->intra_ac_vlc_last_length;
3859         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3860     } else {
3861         start_i = 0;
3862         length     = s->inter_ac_vlc_length;
3863         last_length= s->inter_ac_vlc_last_length;
3864     }
3865
3866     if(last>=start_i){
3867         run=0;
3868         for(i=start_i; i<last; i++){
3869             int j= scantable[i];
3870             level= temp[j];
3871
3872             if(level){
3873                 level+=64;
3874                 if((level&(~127)) == 0){
3875                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3876                 }else
3877                     bits+= esc_length;
3878                 run=0;
3879             }else
3880                 run++;
3881         }
3882         i= scantable[last];
3883
3884         level= temp[i] + 64;
3885
3886         assert(level - 64);
3887
3888         if((level&(~127)) == 0){
3889             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3890         }else
3891             bits+= esc_length;
3892
3893     }
3894
3895     if(last>=0){
3896         if(s->mb_intra)
3897             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3898         else
3899             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3900     }
3901
3902     s->dsp.idct_add(bak, stride, temp);
3903
3904     distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3905
3906     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3907 }
3908
3909 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3910     MpegEncContext * const s= (MpegEncContext *)c;
3911     const uint8_t *scantable= s->intra_scantable.permutated;
3912     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3913     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3914     int i, last, run, bits, level, start_i;
3915     const int esc_length= s->ac_esc_length;
3916     uint8_t * length;
3917     uint8_t * last_length;
3918
3919     assert(h==8);
3920
3921     s->dsp.diff_pixels(temp, src1, src2, stride);
3922
3923     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3924
3925     bits=0;
3926
3927     if (s->mb_intra) {
3928         start_i = 1;
3929         length     = s->intra_ac_vlc_length;
3930         last_length= s->intra_ac_vlc_last_length;
3931         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3932     } else {
3933         start_i = 0;
3934         length     = s->inter_ac_vlc_length;
3935         last_length= s->inter_ac_vlc_last_length;
3936     }
3937
3938     if(last>=start_i){
3939         run=0;
3940         for(i=start_i; i<last; i++){
3941             int j= scantable[i];
3942             level= temp[j];
3943
3944             if(level){
3945                 level+=64;
3946                 if((level&(~127)) == 0){
3947                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3948                 }else
3949                     bits+= esc_length;
3950                 run=0;
3951             }else
3952                 run++;
3953         }
3954         i= scantable[last];
3955
3956         level= temp[i] + 64;
3957
3958         assert(level - 64);
3959
3960         if((level&(~127)) == 0){
3961             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3962         }else
3963             bits+= esc_length;
3964     }
3965
3966     return bits;
3967 }
3968
3969 #define VSAD_INTRA(size) \
3970 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3971     int score=0;                                                                                            \
3972     int x,y;                                                                                                \
3973                                                                                                             \
3974     for(y=1; y<h; y++){                                                                                     \
3975         for(x=0; x<size; x+=4){                                                                             \
3976             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
3977                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
3978         }                                                                                                   \
3979         s+= stride;                                                                                         \
3980     }                                                                                                       \
3981                                                                                                             \
3982     return score;                                                                                           \
3983 }
3984 VSAD_INTRA(8)
3985 VSAD_INTRA(16)
3986
3987 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3988     int score=0;
3989     int x,y;
3990
3991     for(y=1; y<h; y++){
3992         for(x=0; x<16; x++){
3993             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3994         }
3995         s1+= stride;
3996         s2+= stride;
3997     }
3998
3999     return score;
4000 }
4001
4002 #define SQ(a) ((a)*(a))
4003 #define VSSE_INTRA(size) \
4004 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4005     int score=0;                                                                                            \
4006     int x,y;                                                                                                \
4007                                                                                                             \
4008     for(y=1; y<h; y++){                                                                                     \
4009         for(x=0; x<size; x+=4){                                                                               \
4010             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
4011                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
4012         }                                                                                                   \
4013         s+= stride;                                                                                         \
4014     }                                                                                                       \
4015                                                                                                             \
4016     return score;                                                                                           \
4017 }
4018 VSSE_INTRA(8)
4019 VSSE_INTRA(16)
4020
4021 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4022     int score=0;
4023     int x,y;
4024
4025     for(y=1; y<h; y++){
4026         for(x=0; x<16; x++){
4027             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
4028         }
4029         s1+= stride;
4030         s2+= stride;
4031     }
4032
4033     return score;
4034 }
4035
4036 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
4037                                int size){
4038     int score=0;
4039     int i;
4040     for(i=0; i<size; i++)
4041         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
4042     return score;
4043 }
4044
4045 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4046 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4047 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4048 #if CONFIG_GPL
4049 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4050 #endif
4051 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4052 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4053 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4054 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4055
4056 static void vector_fmul_c(float *dst, const float *src, int len){
4057     int i;
4058     for(i=0; i<len; i++)
4059         dst[i] *= src[i];
4060 }
4061
4062 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
4063     int i;
4064     src1 += len-1;
4065     for(i=0; i<len; i++)
4066         dst[i] = src0[i] * src1[-i];
4067 }
4068
4069 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
4070     int i;
4071     for(i=0; i<len; i++)
4072         dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
4073 }
4074
4075 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
4076     int i,j;
4077     dst += len;
4078     win += len;
4079     src0+= len;
4080     for(i=-len, j=len-1; i<0; i++, j--) {
4081         float s0 = src0[i];
4082         float s1 = src1[j];
4083         float wi = win[i];
4084         float wj = win[j];
4085         dst[i] = s0*wj - s1*wi + add_bias;
4086         dst[j] = s0*wi + s1*wj + add_bias;
4087     }
4088 }
4089
4090 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
4091     int i;
4092     for(i=0; i<len; i++)
4093         dst[i] = src[i] * mul;
4094 }
4095
4096 static av_always_inline int float_to_int16_one(const float *src){
4097     int_fast32_t tmp = *(const int32_t*)src;
4098     if(tmp & 0xf0000){
4099         tmp = (0x43c0ffff - tmp)>>31;
4100         // is this faster on some gcc/cpu combinations?
4101 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4102 //      else                 tmp = 0;
4103     }
4104     return tmp - 0x8000;
4105 }
4106
4107 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
4108     int i;
4109     for(i=0; i<len; i++)
4110         dst[i] = float_to_int16_one(src+i);
4111 }
4112
4113 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4114     int i,j,c;
4115     if(channels==2){
4116         for(i=0; i<len; i++){
4117             dst[2*i]   = float_to_int16_one(src[0]+i);
4118             dst[2*i+1] = float_to_int16_one(src[1]+i);
4119         }
4120     }else{
4121         for(c=0; c<channels; c++)
4122             for(i=0, j=c; i<len; i++, j+=channels)
4123                 dst[j] = float_to_int16_one(src[c]+i);
4124     }
4125 }
4126
4127 static void add_int16_c(int16_t * v1, int16_t * v2, int order)
4128 {
4129     while (order--)
4130        *v1++ += *v2++;
4131 }
4132
4133 static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
4134 {
4135     while (order--)
4136         *v1++ -= *v2++;
4137 }
4138
4139 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4140 {
4141     int res = 0;
4142
4143     while (order--)
4144         res += (*v1++ * *v2++) >> shift;
4145
4146     return res;
4147 }
4148
4149 #define W0 2048
4150 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4151 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4152 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4153 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4154 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4155 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4156 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
4157
4158 static void wmv2_idct_row(short * b)
4159 {
4160     int s1,s2;
4161     int a0,a1,a2,a3,a4,a5,a6,a7;
4162     /*step 1*/
4163     a1 = W1*b[1]+W7*b[7];
4164     a7 = W7*b[1]-W1*b[7];
4165     a5 = W5*b[5]+W3*b[3];
4166     a3 = W3*b[5]-W5*b[3];
4167     a2 = W2*b[2]+W6*b[6];
4168     a6 = W6*b[2]-W2*b[6];
4169     a0 = W0*b[0]+W0*b[4];
4170     a4 = W0*b[0]-W0*b[4];
4171     /*step 2*/
4172     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4173     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4174     /*step 3*/
4175     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4176     b[1] = (a4+a6 +s1   + (1<<7))>>8;
4177     b[2] = (a4-a6 +s2   + (1<<7))>>8;
4178     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4179     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4180     b[5] = (a4-a6 -s2   + (1<<7))>>8;
4181     b[6] = (a4+a6 -s1   + (1<<7))>>8;
4182     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4183 }
4184 static void wmv2_idct_col(short * b)
4185 {
4186     int s1,s2;
4187     int a0,a1,a2,a3,a4,a5,a6,a7;
4188     /*step 1, with extended precision*/
4189     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4190     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4191     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4192     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4193     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4194     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4195     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4196     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4197     /*step 2*/
4198     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4199     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4200     /*step 3*/
4201     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4202     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4203     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4204     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4205
4206     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4207     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4208     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4209     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4210 }
4211 void ff_wmv2_idct_c(short * block){
4212     int i;
4213
4214     for(i=0;i<64;i+=8){
4215         wmv2_idct_row(block+i);
4216     }
4217     for(i=0;i<8;i++){
4218         wmv2_idct_col(block+i);
4219     }
4220 }
4221 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4222  converted */
4223 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4224 {
4225     ff_wmv2_idct_c(block);
4226     put_pixels_clamped_c(block, dest, line_size);
4227 }
4228 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4229 {
4230     ff_wmv2_idct_c(block);
4231     add_pixels_clamped_c(block, dest, line_size);
4232 }
4233 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4234 {
4235     j_rev_dct (block);
4236     put_pixels_clamped_c(block, dest, line_size);
4237 }
4238 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4239 {
4240     j_rev_dct (block);
4241     add_pixels_clamped_c(block, dest, line_size);
4242 }
4243
4244 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4245 {
4246     j_rev_dct4 (block);
4247     put_pixels_clamped4_c(block, dest, line_size);
4248 }
4249 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4250 {
4251     j_rev_dct4 (block);
4252     add_pixels_clamped4_c(block, dest, line_size);
4253 }
4254
4255 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4256 {
4257     j_rev_dct2 (block);
4258     put_pixels_clamped2_c(block, dest, line_size);
4259 }
4260 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4261 {
4262     j_rev_dct2 (block);
4263     add_pixels_clamped2_c(block, dest, line_size);
4264 }
4265
4266 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4267 {
4268     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4269
4270     dest[0] = cm[(block[0] + 4)>>3];
4271 }
4272 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4273 {
4274     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4275
4276     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4277 }
4278
4279 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4280
4281 /* init static data */
4282 void dsputil_static_init(void)
4283 {
4284     int i;
4285
4286     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4287     for(i=0;i<MAX_NEG_CROP;i++) {
4288         ff_cropTbl[i] = 0;
4289         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4290     }
4291
4292     for(i=0;i<512;i++) {
4293         ff_squareTbl[i] = (i - 256) * (i - 256);
4294     }
4295
4296     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4297 }
4298
4299 int ff_check_alignment(void){
4300     static int did_fail=0;
4301     DECLARE_ALIGNED_16(int, aligned);
4302
4303     if((intptr_t)&aligned & 15){
4304         if(!did_fail){
4305 #if HAVE_MMX || HAVE_ALTIVEC
4306             av_log(NULL, AV_LOG_ERROR,
4307                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4308                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4309                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4310                 "Do not report crashes to FFmpeg developers.\n");
4311 #endif
4312             did_fail=1;
4313         }
4314         return -1;
4315     }
4316     return 0;
4317 }
4318
4319 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4320 {
4321     int i;
4322
4323     ff_check_alignment();
4324
4325 #if CONFIG_ENCODERS
4326     if(avctx->dct_algo==FF_DCT_FASTINT) {
4327         c->fdct = fdct_ifast;
4328         c->fdct248 = fdct_ifast248;
4329     }
4330     else if(avctx->dct_algo==FF_DCT_FAAN) {
4331         c->fdct = ff_faandct;
4332         c->fdct248 = ff_faandct248;
4333     }
4334     else {
4335         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4336         c->fdct248 = ff_fdct248_islow;
4337     }
4338 #endif //CONFIG_ENCODERS
4339
4340     if(avctx->lowres==1){
4341         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4342             c->idct_put= ff_jref_idct4_put;
4343             c->idct_add= ff_jref_idct4_add;
4344         }else{
4345             c->idct_put= ff_h264_lowres_idct_put_c;
4346             c->idct_add= ff_h264_lowres_idct_add_c;
4347         }
4348         c->idct    = j_rev_dct4;
4349         c->idct_permutation_type= FF_NO_IDCT_PERM;
4350     }else if(avctx->lowres==2){
4351         c->idct_put= ff_jref_idct2_put;
4352         c->idct_add= ff_jref_idct2_add;
4353         c->idct    = j_rev_dct2;
4354         c->idct_permutation_type= FF_NO_IDCT_PERM;
4355     }else if(avctx->lowres==3){
4356         c->idct_put= ff_jref_idct1_put;
4357         c->idct_add= ff_jref_idct1_add;
4358         c->idct    = j_rev_dct1;
4359         c->idct_permutation_type= FF_NO_IDCT_PERM;
4360     }else{
4361         if(avctx->idct_algo==FF_IDCT_INT){
4362             c->idct_put= ff_jref_idct_put;
4363             c->idct_add= ff_jref_idct_add;
4364             c->idct    = j_rev_dct;
4365             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4366         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4367                 avctx->idct_algo==FF_IDCT_VP3){
4368             c->idct_put= ff_vp3_idct_put_c;
4369             c->idct_add= ff_vp3_idct_add_c;
4370             c->idct    = ff_vp3_idct_c;
4371             c->idct_permutation_type= FF_NO_IDCT_PERM;
4372         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4373             c->idct_put= ff_wmv2_idct_put_c;
4374             c->idct_add= ff_wmv2_idct_add_c;
4375             c->idct    = ff_wmv2_idct_c;
4376             c->idct_permutation_type= FF_NO_IDCT_PERM;
4377         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4378             c->idct_put= ff_faanidct_put;
4379             c->idct_add= ff_faanidct_add;
4380             c->idct    = ff_faanidct;
4381             c->idct_permutation_type= FF_NO_IDCT_PERM;
4382         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4383             c->idct_put= ff_ea_idct_put_c;
4384             c->idct_permutation_type= FF_NO_IDCT_PERM;
4385         }else{ //accurate/default
4386             c->idct_put= ff_simple_idct_put;
4387             c->idct_add= ff_simple_idct_add;
4388             c->idct    = ff_simple_idct;
4389             c->idct_permutation_type= FF_NO_IDCT_PERM;
4390         }
4391     }
4392
4393     if (CONFIG_H264_DECODER) {
4394         c->h264_idct_add= ff_h264_idct_add_c;
4395         c->h264_idct8_add= ff_h264_idct8_add_c;
4396         c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4397         c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4398         c->h264_idct_add16     = ff_h264_idct_add16_c;
4399         c->h264_idct8_add4     = ff_h264_idct8_add4_c;
4400         c->h264_idct_add8      = ff_h264_idct_add8_c;
4401         c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4402     }
4403
4404     c->get_pixels = get_pixels_c;
4405     c->diff_pixels = diff_pixels_c;
4406     c->put_pixels_clamped = put_pixels_clamped_c;
4407     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4408     c->add_pixels_clamped = add_pixels_clamped_c;
4409     c->add_pixels8 = add_pixels8_c;
4410     c->add_pixels4 = add_pixels4_c;
4411     c->sum_abs_dctelem = sum_abs_dctelem_c;
4412     c->gmc1 = gmc1_c;
4413     c->gmc = ff_gmc_c;
4414     c->clear_block = clear_block_c;
4415     c->clear_blocks = clear_blocks_c;
4416     c->pix_sum = pix_sum_c;
4417     c->pix_norm1 = pix_norm1_c;
4418
4419     /* TODO [0] 16  [1] 8 */
4420     c->pix_abs[0][0] = pix_abs16_c;
4421     c->pix_abs[0][1] = pix_abs16_x2_c;
4422     c->pix_abs[0][2] = pix_abs16_y2_c;
4423     c->pix_abs[0][3] = pix_abs16_xy2_c;
4424     c->pix_abs[1][0] = pix_abs8_c;
4425     c->pix_abs[1][1] = pix_abs8_x2_c;
4426     c->pix_abs[1][2] = pix_abs8_y2_c;
4427     c->pix_abs[1][3] = pix_abs8_xy2_c;
4428
4429 #define dspfunc(PFX, IDX, NUM) \
4430     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4431     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4432     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4433     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4434
4435     dspfunc(put, 0, 16);
4436     dspfunc(put_no_rnd, 0, 16);
4437     dspfunc(put, 1, 8);
4438     dspfunc(put_no_rnd, 1, 8);
4439     dspfunc(put, 2, 4);
4440     dspfunc(put, 3, 2);
4441
4442     dspfunc(avg, 0, 16);
4443     dspfunc(avg_no_rnd, 0, 16);
4444     dspfunc(avg, 1, 8);
4445     dspfunc(avg_no_rnd, 1, 8);
4446     dspfunc(avg, 2, 4);
4447     dspfunc(avg, 3, 2);
4448 #undef dspfunc
4449
4450     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4451     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4452
4453     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4454     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4455     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4456     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4457     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4458     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4459     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4460     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4461     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4462
4463     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4464     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4465     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4466     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4467     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4468     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4469     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4470     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4471     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4472
4473 #define dspfunc(PFX, IDX, NUM) \
4474     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4475     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4476     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4477     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4478     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4479     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4480     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4481     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4482     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4483     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4484     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4485     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4486     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4487     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4488     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4489     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4490
4491     dspfunc(put_qpel, 0, 16);
4492     dspfunc(put_no_rnd_qpel, 0, 16);
4493
4494     dspfunc(avg_qpel, 0, 16);
4495     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4496
4497     dspfunc(put_qpel, 1, 8);
4498     dspfunc(put_no_rnd_qpel, 1, 8);
4499
4500     dspfunc(avg_qpel, 1, 8);
4501     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4502
4503     dspfunc(put_h264_qpel, 0, 16);
4504     dspfunc(put_h264_qpel, 1, 8);
4505     dspfunc(put_h264_qpel, 2, 4);
4506     dspfunc(put_h264_qpel, 3, 2);
4507     dspfunc(avg_h264_qpel, 0, 16);
4508     dspfunc(avg_h264_qpel, 1, 8);
4509     dspfunc(avg_h264_qpel, 2, 4);
4510
4511 #undef dspfunc
4512     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4513     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4514     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4515     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4516     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4517     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4518     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4519     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4520
4521     c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4522     c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4523     c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4524     c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4525     c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4526     c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4527     c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4528     c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4529     c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4530     c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4531     c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4532     c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4533     c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4534     c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4535     c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4536     c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4537     c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4538     c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4539     c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4540     c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4541
4542     c->draw_edges = draw_edges_c;
4543
4544 #if CONFIG_CAVS_DECODER
4545     ff_cavsdsp_init(c,avctx);
4546 #endif
4547
4548 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4549     ff_mlp_init(c, avctx);
4550 #endif
4551 #if CONFIG_VC1_DECODER
4552     ff_vc1dsp_init(c,avctx);
4553 #endif
4554 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4555     ff_intrax8dsp_init(c,avctx);
4556 #endif
4557 #if CONFIG_RV30_DECODER
4558     ff_rv30dsp_init(c,avctx);
4559 #endif
4560 #if CONFIG_RV40_DECODER
4561     ff_rv40dsp_init(c,avctx);
4562     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4563     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4564     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4565     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4566 #endif
4567
4568     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4569     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4570     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4571     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4572     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4573     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4574     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4575     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4576
4577 #define SET_CMP_FUNC(name) \
4578     c->name[0]= name ## 16_c;\
4579     c->name[1]= name ## 8x8_c;
4580
4581     SET_CMP_FUNC(hadamard8_diff)
4582     c->hadamard8_diff[4]= hadamard8_intra16_c;
4583     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4584     SET_CMP_FUNC(dct_sad)
4585     SET_CMP_FUNC(dct_max)
4586 #if CONFIG_GPL
4587     SET_CMP_FUNC(dct264_sad)
4588 #endif
4589     c->sad[0]= pix_abs16_c;
4590     c->sad[1]= pix_abs8_c;
4591     c->sse[0]= sse16_c;
4592     c->sse[1]= sse8_c;
4593     c->sse[2]= sse4_c;
4594     SET_CMP_FUNC(quant_psnr)
4595     SET_CMP_FUNC(rd)
4596     SET_CMP_FUNC(bit)
4597     c->vsad[0]= vsad16_c;
4598     c->vsad[4]= vsad_intra16_c;
4599     c->vsad[5]= vsad_intra8_c;
4600     c->vsse[0]= vsse16_c;
4601     c->vsse[4]= vsse_intra16_c;
4602     c->vsse[5]= vsse_intra8_c;
4603     c->nsse[0]= nsse16_c;
4604     c->nsse[1]= nsse8_c;
4605 #if CONFIG_SNOW_ENCODER
4606     c->w53[0]= w53_16_c;
4607     c->w53[1]= w53_8_c;
4608     c->w97[0]= w97_16_c;
4609     c->w97[1]= w97_8_c;
4610 #endif
4611
4612     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4613
4614     c->add_bytes= add_bytes_c;
4615     c->add_bytes_l2= add_bytes_l2_c;
4616     c->diff_bytes= diff_bytes_c;
4617     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4618     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4619     c->bswap_buf= bswap_buf;
4620 #if CONFIG_PNG_DECODER
4621     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4622 #endif
4623
4624     c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4625     c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4626     c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4627     c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4628     c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4629     c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4630     c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4631     c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4632     c->h264_loop_filter_strength= NULL;
4633
4634     if (CONFIG_ANY_H263) {
4635         c->h263_h_loop_filter= h263_h_loop_filter_c;
4636         c->h263_v_loop_filter= h263_v_loop_filter_c;
4637     }
4638
4639     if (CONFIG_VP3_DECODER) {
4640         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4641         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4642     }
4643     if (CONFIG_VP6_DECODER) {
4644         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4645     }
4646
4647     c->h261_loop_filter= h261_loop_filter_c;
4648
4649     c->try_8x8basis= try_8x8basis_c;
4650     c->add_8x8basis= add_8x8basis_c;
4651
4652 #if CONFIG_SNOW_DECODER
4653     c->vertical_compose97i = ff_snow_vertical_compose97i;
4654     c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4655     c->inner_add_yblock = ff_snow_inner_add_yblock;
4656 #endif
4657
4658 #if CONFIG_VORBIS_DECODER
4659     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4660 #endif
4661 #if CONFIG_AC3_DECODER
4662     c->ac3_downmix = ff_ac3_downmix_c;
4663 #endif
4664 #if CONFIG_FLAC_ENCODER
4665     c->flac_compute_autocorr = ff_flac_compute_autocorr;
4666 #endif
4667     c->vector_fmul = vector_fmul_c;
4668     c->vector_fmul_reverse = vector_fmul_reverse_c;
4669     c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4670     c->vector_fmul_window = ff_vector_fmul_window_c;
4671     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4672     c->float_to_int16 = ff_float_to_int16_c;
4673     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4674     c->add_int16 = add_int16_c;
4675     c->sub_int16 = sub_int16_c;
4676     c->scalarproduct_int16 = scalarproduct_int16_c;
4677
4678     c->shrink[0]= ff_img_copy_plane;
4679     c->shrink[1]= ff_shrink22;
4680     c->shrink[2]= ff_shrink44;
4681     c->shrink[3]= ff_shrink88;
4682
4683     c->prefetch= just_return;
4684
4685     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4686     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4687
4688     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4689     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4690     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4691     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4692     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4693     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4694     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4695     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4696     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4697
4698     for(i=0; i<64; i++){
4699         if(!c->put_2tap_qpel_pixels_tab[0][i])
4700             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4701         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4702             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4703     }
4704
4705     switch(c->idct_permutation_type){
4706     case FF_NO_IDCT_PERM:
4707         for(i=0; i<64; i++)
4708             c->idct_permutation[i]= i;
4709         break;
4710     case FF_LIBMPEG2_IDCT_PERM:
4711         for(i=0; i<64; i++)
4712             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4713         break;
4714     case FF_SIMPLE_IDCT_PERM:
4715         for(i=0; i<64; i++)
4716             c->idct_permutation[i]= simple_mmx_permutation[i];
4717         break;
4718     case FF_TRANSPOSE_IDCT_PERM:
4719         for(i=0; i<64; i++)
4720             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4721         break;
4722     case FF_PARTTRANS_IDCT_PERM:
4723         for(i=0; i<64; i++)
4724             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4725         break;
4726     case FF_SSE2_IDCT_PERM:
4727         for(i=0; i<64; i++)
4728             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4729         break;
4730     default:
4731         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4732     }
4733 }
4734