/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
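
/* Worked example of the ~0UL/255 trick (plain unsigned arithmetic, added for
 * illustration): with a 32-bit unsigned long, ~0UL/255 = 0xFFFFFFFF/0xFF =
 * 0x01010101, so pb_7f = 0x01010101 * 0x7f = 0x7f7f7f7f; with a 64-bit
 * unsigned long the same division gives 0x0101010101010101 and
 * pb_7f = 0x7f7f7f7f7f7f7f7f. The byte is thus replicated into every byte
 * lane of the native word, whatever its size. */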

const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
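
/* Example (illustration only): the scan visits raster positions
 * 0, 1, 8, 16, 9, 2, ..., so scan position 4 reads the coefficient at
 * raster index 9 (row 1, column 1), walking the 8x8 block in anti-diagonal
 * order from the DC coefficient towards the highest frequencies. */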

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
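
/* Worked example of the identity above (illustration, easily checked by
 * hand): ff_inverse[3] = 1431655766 = ceil(2^32/3), so for a = 65536,
 * a*ff_inverse[3] = 0x555555560000 and (a*ff_inverse[3])>>32 = 0x5555 =
 * 21845 = 65536/3. The table thus replaces an integer division by b with
 * one multiplication and one shift. */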

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
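
/* Added note: st->permutated[] is the scan order composed with the IDCT's
 * coefficient permutation, and st->raster_end[i] is the highest permutated
 * index occurring in scan positions 0..i (the running maximum computed by
 * the second loop above). */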

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
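
/* Explanatory note on the sq pointer used by the sse*_c functions (comment
 * only, no behaviour change): ff_squareTbl is laid out so that
 * sq = ff_squareTbl + 256 can be indexed with the signed difference
 * pix1[i] - pix2[i] in [-255, 255]; sq[d] holds d*d, so no abs() is needed
 * before squaring. */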

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
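
/* Hypothetical usage sketch (not part of the original file): a motion
 * compensation routine that needs an 8x8 block whose top-left corner
 * (mx, my) may lie outside the picture could do
 *
 *     uint8_t tmp[8*linesize];                       // scratch, 8 lines
 *     ff_emulated_edge_mc(tmp, src + my*linesize + mx, linesize,
 *                         8, 8, mx, my, pic_width, pic_height);
 *     // then read the block from tmp instead of src
 *
 * The function clamps the read area to the picture and replicates the
 * border samples into the parts of tmp that fall outside it. */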

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant; the disabled #if 0 branch above is the 64 bit one

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
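
/* Explanatory note (added): avg2/avg4 are the scalar forms of the rounded
 * averages used throughout this file. The packed variants above rely on the
 * identities (a+b+1)>>1 == (a|b) - ((a^b)>>1) and (a+b)>>1 == (a&b) +
 * ((a^b)>>1); masking the xor with 0xFE...FE before the shift keeps each
 * byte's shifted-out bit from leaking into the neighbouring byte lane,
 * which is what rnd_avg32()/no_rnd_avg32() and the 64-bit forms implement.
 * The xy2 functions instead split every byte into its low 2 bits (l0/l1)
 * and its high 6 bits pre-shifted down by 2 (h0/h1), so four pixels plus
 * the rounding constant can be summed per byte lane without overflow. */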

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
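
/* Added note: the four bilinear weights satisfy
 * A+B+C+D = (16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16 + x16*y16 = 256,
 * so after adding the rounder the >>8 renormalizes the interpolation. */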

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
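
/* Added note, a reading of the code above rather than new behaviour: ox/oy
 * (and the per-column vx/vy) are fixed-point source coordinates; vx>>16
 * gives the position in units of 1/(1<<shift) pixels, from which
 * frac_x/frac_y (the bilinear weights, in 0..s-1) and the integer sample
 * position are split. vx/vy advance by (dxx, dyx) per output column and
 * ox/oy by (dxy, dyy) per output row, which makes the transform affine;
 * the av_clip() branches replicate the border when a source coordinate
 * leaves the picture. */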

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
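
/* Worked example for the constant 683 (illustration only): 683 =
 * round(2^11/3), so (683*(2*a + b + 1)) >> 11 approximates (2*a + b)/3,
 * the one-third-pel weighting; e.g. a = 0, b = 255 gives (683*256)>>11 =
 * 85 = 255/3. The mc11..mc22 variants below use 2731 = round(2^15/12) the
 * same way for the /12 weightings, whose coefficients sum to 12. */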

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
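
/* Added note: in the chroma MC above the weights satisfy A+B+C+D =
 * (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64, which is why op_put/op_avg
 * add 32 and shift right by 6 to renormalize with rounding. */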

static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1815 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1816 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1817 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1818 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1819 dst++;\
1820 src++;\
1824 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1825 OPNAME ## pixels8_c(dst, src, stride, 8);\
1828 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1829 uint8_t half[64];\
1830 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1831 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1834 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1835 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1838 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1839 uint8_t half[64];\
1840 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1841 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1844 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t full[16*9];\
1846 uint8_t half[64];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1849 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1852 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1853 uint8_t full[16*9];\
1854 copy_block9(full, src, 16, stride, 9);\
1855 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1858 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1859 uint8_t full[16*9];\
1860 uint8_t half[64];\
1861 copy_block9(full, src, 16, stride, 9);\
1862 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1863 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1865 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1866 uint8_t full[16*9];\
1867 uint8_t halfH[72];\
1868 uint8_t halfV[64];\
1869 uint8_t halfHV[64];\
1870 copy_block9(full, src, 16, stride, 9);\
1871 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1873 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1874 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1876 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1877 uint8_t full[16*9];\
1878 uint8_t halfH[72];\
1879 uint8_t halfHV[64];\
1880 copy_block9(full, src, 16, stride, 9);\
1881 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1882 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1883 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1884 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1886 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t full[16*9];\
1888 uint8_t halfH[72];\
1889 uint8_t halfV[64];\
1890 uint8_t halfHV[64];\
1891 copy_block9(full, src, 16, stride, 9);\
1892 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1894 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1897 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1898 uint8_t full[16*9];\
1899 uint8_t halfH[72];\
1900 uint8_t halfHV[64];\
1901 copy_block9(full, src, 16, stride, 9);\
1902 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1903 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1904 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1905 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1907 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1908 uint8_t full[16*9];\
1909 uint8_t halfH[72];\
1910 uint8_t halfV[64];\
1911 uint8_t halfHV[64];\
1912 copy_block9(full, src, 16, stride, 9);\
1913 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1914 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1915 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1916 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1918 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[16*9];\
1920 uint8_t halfH[72];\
1921 uint8_t halfHV[64];\
1922 copy_block9(full, src, 16, stride, 9);\
1923 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1924 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1925 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1926 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1928 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[16*9];\
1930 uint8_t halfH[72];\
1931 uint8_t halfV[64];\
1932 uint8_t halfHV[64];\
1933 copy_block9(full, src, 16, stride, 9);\
1934 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1936 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1937 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1939 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1940 uint8_t full[16*9];\
1941 uint8_t halfH[72];\
1942 uint8_t halfHV[64];\
1943 copy_block9(full, src, 16, stride, 9);\
1944 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1945 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1946 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1947 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1949 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1950 uint8_t halfH[72];\
1951 uint8_t halfHV[64];\
1952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1954 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1956 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1957 uint8_t halfH[72];\
1958 uint8_t halfHV[64];\
1959 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1960 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1961 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1963 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1964 uint8_t full[16*9];\
1965 uint8_t halfH[72];\
1966 uint8_t halfV[64];\
1967 uint8_t halfHV[64];\
1968 copy_block9(full, src, 16, stride, 9);\
1969 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1972 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1974 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1975 uint8_t full[16*9];\
1976 uint8_t halfH[72];\
1977 copy_block9(full, src, 16, stride, 9);\
1978 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1979 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1980 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1982 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1983 uint8_t full[16*9];\
1984 uint8_t halfH[72];\
1985 uint8_t halfV[64];\
1986 uint8_t halfHV[64];\
1987 copy_block9(full, src, 16, stride, 9);\
1988 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1989 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1990 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1991 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1993 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t full[16*9];\
1995 uint8_t halfH[72];\
1996 copy_block9(full, src, 16, stride, 9);\
1997 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1998 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1999 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2001 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2002 uint8_t halfH[72];\
2003 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2004 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2006 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2007 OPNAME ## pixels16_c(dst, src, stride, 16);\
2010 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2011 uint8_t half[256];\
2012 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2013 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2016 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2017 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2020 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2021 uint8_t half[256];\
2022 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2023 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2026 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2027 uint8_t full[24*17];\
2028 uint8_t half[256];\
2029 copy_block17(full, src, 24, stride, 17);\
2030 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2031 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2034 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2035 uint8_t full[24*17];\
2036 copy_block17(full, src, 24, stride, 17);\
2037 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2040 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2041 uint8_t full[24*17];\
2042 uint8_t half[256];\
2043 copy_block17(full, src, 24, stride, 17);\
2044 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2045 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2047 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2048 uint8_t full[24*17];\
2049 uint8_t halfH[272];\
2050 uint8_t halfV[256];\
2051 uint8_t halfHV[256];\
2052 copy_block17(full, src, 24, stride, 17);\
2053 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2055 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2056 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2058 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2059 uint8_t full[24*17];\
2060 uint8_t halfH[272];\
2061 uint8_t halfHV[256];\
2062 copy_block17(full, src, 24, stride, 17);\
2063 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2064 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2065 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2066 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2068 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2069 uint8_t full[24*17];\
2070 uint8_t halfH[272];\
2071 uint8_t halfV[256];\
2072 uint8_t halfHV[256];\
2073 copy_block17(full, src, 24, stride, 17);\
2074 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2076 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2077 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2079 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2080 uint8_t full[24*17];\
2081 uint8_t halfH[272];\
2082 uint8_t halfHV[256];\
2083 copy_block17(full, src, 24, stride, 17);\
2084 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2085 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2086 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2087 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2089 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2090 uint8_t full[24*17];\
2091 uint8_t halfH[272];\
2092 uint8_t halfV[256];\
2093 uint8_t halfHV[256];\
2094 copy_block17(full, src, 24, stride, 17);\
2095 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2096 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2097 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2098 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2100 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2101 uint8_t full[24*17];\
2102 uint8_t halfH[272];\
2103 uint8_t halfHV[256];\
2104 copy_block17(full, src, 24, stride, 17);\
2105 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2106 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2107 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2108 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2110 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2111 uint8_t full[24*17];\
2112 uint8_t halfH[272];\
2113 uint8_t halfV[256];\
2114 uint8_t halfHV[256];\
2115 copy_block17(full, src, 24, stride, 17);\
2116 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2118 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2119 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2121 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2122 uint8_t full[24*17];\
2123 uint8_t halfH[272];\
2124 uint8_t halfHV[256];\
2125 copy_block17(full, src, 24, stride, 17);\
2126 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2127 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2128 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2129 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2131 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2132 uint8_t halfH[272];\
2133 uint8_t halfHV[256];\
2134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2136 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2138 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2139 uint8_t halfH[272];\
2140 uint8_t halfHV[256];\
2141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2142 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2143 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2145 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2146 uint8_t full[24*17];\
2147 uint8_t halfH[272];\
2148 uint8_t halfV[256];\
2149 uint8_t halfHV[256];\
2150 copy_block17(full, src, 24, stride, 17);\
2151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2153 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2154 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2156 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2157 uint8_t full[24*17];\
2158 uint8_t halfH[272];\
2159 copy_block17(full, src, 24, stride, 17);\
2160 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2161 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2162 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2164 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2165 uint8_t full[24*17];\
2166 uint8_t halfH[272];\
2167 uint8_t halfV[256];\
2168 uint8_t halfHV[256];\
2169 copy_block17(full, src, 24, stride, 17);\
2170 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2171 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2172 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2173 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2175 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2176 uint8_t full[24*17];\
2177 uint8_t halfH[272];\
2178 copy_block17(full, src, 24, stride, 17);\
2179 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2180 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2181 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2183 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2184 uint8_t halfH[272];\
2185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2186 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2189 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2190 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2191 #define op_put(a, b) a = cm[((b) + 16)>>5]
2192 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2194 QPEL_MC(0, put_ , _ , op_put)
2195 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2196 QPEL_MC(0, avg_ , _ , op_avg)
2197 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2198 #undef op_avg
2199 #undef op_avg_no_rnd
2200 #undef op_put
2201 #undef op_put_no_rnd
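/* H.264 luma sub-pel filtering. Half-pel samples use the 6-tap filter
 * (1, -5, 20, 20, -5, 1), which also sums to 32. One-dimensional passes
 * round with (+16)>>5; the 2D (hv) versions keep the unclipped first
 * pass in a 16-bit tmp buffer (values stay within roughly -2550..10710,
 * so int16_t suffices) and the second pass divides by 32*32 via
 * (+512)>>10 in the OP2 macros. */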
2203 #if 1
2204 #define H264_LOWPASS(OPNAME, OP, OP2) \
2205 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206 const int h=2;\
2207 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208 int i;\
2209 for(i=0; i<h; i++)\
2211 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2212 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2213 dst+=dstStride;\
2214 src+=srcStride;\
2218 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2219 const int w=2;\
2220 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2221 int i;\
2222 for(i=0; i<w; i++)\
2224 const int srcB= src[-2*srcStride];\
2225 const int srcA= src[-1*srcStride];\
2226 const int src0= src[0 *srcStride];\
2227 const int src1= src[1 *srcStride];\
2228 const int src2= src[2 *srcStride];\
2229 const int src3= src[3 *srcStride];\
2230 const int src4= src[4 *srcStride];\
2231 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2232 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2233 dst++;\
2234 src++;\
2238 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2239 const int h=2;\
2240 const int w=2;\
2241 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2242 int i;\
2243 src -= 2*srcStride;\
2244 for(i=0; i<h+5; i++)\
2246 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2247 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2248 tmp+=tmpStride;\
2249 src+=srcStride;\
2251 tmp -= tmpStride*(h+5-2);\
2252 for(i=0; i<w; i++)\
2254 const int tmpB= tmp[-2*tmpStride];\
2255 const int tmpA= tmp[-1*tmpStride];\
2256 const int tmp0= tmp[0 *tmpStride];\
2257 const int tmp1= tmp[1 *tmpStride];\
2258 const int tmp2= tmp[2 *tmpStride];\
2259 const int tmp3= tmp[3 *tmpStride];\
2260 const int tmp4= tmp[4 *tmpStride];\
2261 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2262 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2263 dst++;\
2264 tmp++;\
2267 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2268 const int h=4;\
2269 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2270 int i;\
2271 for(i=0; i<h; i++)\
2273 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2274 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2275 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2276 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2277 dst+=dstStride;\
2278 src+=srcStride;\
2282 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2283 const int w=4;\
2284 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2285 int i;\
2286 for(i=0; i<w; i++)\
2288 const int srcB= src[-2*srcStride];\
2289 const int srcA= src[-1*srcStride];\
2290 const int src0= src[0 *srcStride];\
2291 const int src1= src[1 *srcStride];\
2292 const int src2= src[2 *srcStride];\
2293 const int src3= src[3 *srcStride];\
2294 const int src4= src[4 *srcStride];\
2295 const int src5= src[5 *srcStride];\
2296 const int src6= src[6 *srcStride];\
2297 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2298 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2299 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2300 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2301 dst++;\
2302 src++;\
2306 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2307 const int h=4;\
2308 const int w=4;\
2309 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2310 int i;\
2311 src -= 2*srcStride;\
2312 for(i=0; i<h+5; i++)\
2314 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2315 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2316 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2317 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2318 tmp+=tmpStride;\
2319 src+=srcStride;\
2321 tmp -= tmpStride*(h+5-2);\
2322 for(i=0; i<w; i++)\
2324 const int tmpB= tmp[-2*tmpStride];\
2325 const int tmpA= tmp[-1*tmpStride];\
2326 const int tmp0= tmp[0 *tmpStride];\
2327 const int tmp1= tmp[1 *tmpStride];\
2328 const int tmp2= tmp[2 *tmpStride];\
2329 const int tmp3= tmp[3 *tmpStride];\
2330 const int tmp4= tmp[4 *tmpStride];\
2331 const int tmp5= tmp[5 *tmpStride];\
2332 const int tmp6= tmp[6 *tmpStride];\
2333 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2334 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2335 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2336 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2337 dst++;\
2338 tmp++;\
2342 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2343 const int h=8;\
2344 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2345 int i;\
2346 for(i=0; i<h; i++)\
2348 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2349 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2350 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2351 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2352 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2353 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2354 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2355 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2356 dst+=dstStride;\
2357 src+=srcStride;\
2361 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2362 const int w=8;\
2363 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2364 int i;\
2365 for(i=0; i<w; i++)\
2367 const int srcB= src[-2*srcStride];\
2368 const int srcA= src[-1*srcStride];\
2369 const int src0= src[0 *srcStride];\
2370 const int src1= src[1 *srcStride];\
2371 const int src2= src[2 *srcStride];\
2372 const int src3= src[3 *srcStride];\
2373 const int src4= src[4 *srcStride];\
2374 const int src5= src[5 *srcStride];\
2375 const int src6= src[6 *srcStride];\
2376 const int src7= src[7 *srcStride];\
2377 const int src8= src[8 *srcStride];\
2378 const int src9= src[9 *srcStride];\
2379 const int src10=src[10*srcStride];\
2380 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2381 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2382 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2383 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2384 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2385 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2386 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2387 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2388 dst++;\
2389 src++;\
2393 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2394 const int h=8;\
2395 const int w=8;\
2396 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2397 int i;\
2398 src -= 2*srcStride;\
2399 for(i=0; i<h+5; i++)\
2401 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2402 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2403 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2404 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2405 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2406 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2407 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2408 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2409 tmp+=tmpStride;\
2410 src+=srcStride;\
2412 tmp -= tmpStride*(h+5-2);\
2413 for(i=0; i<w; i++)\
2415 const int tmpB= tmp[-2*tmpStride];\
2416 const int tmpA= tmp[-1*tmpStride];\
2417 const int tmp0= tmp[0 *tmpStride];\
2418 const int tmp1= tmp[1 *tmpStride];\
2419 const int tmp2= tmp[2 *tmpStride];\
2420 const int tmp3= tmp[3 *tmpStride];\
2421 const int tmp4= tmp[4 *tmpStride];\
2422 const int tmp5= tmp[5 *tmpStride];\
2423 const int tmp6= tmp[6 *tmpStride];\
2424 const int tmp7= tmp[7 *tmpStride];\
2425 const int tmp8= tmp[8 *tmpStride];\
2426 const int tmp9= tmp[9 *tmpStride];\
2427 const int tmp10=tmp[10*tmpStride];\
2428 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2429 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2430 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2431 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2432 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2433 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2434 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2435 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2436 dst++;\
2437 tmp++;\
2441 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2442 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2443 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2444 src += 8*srcStride;\
2445 dst += 8*dstStride;\
2446 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2447 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2450 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2451 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2452 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2453 src += 8*srcStride;\
2454 dst += 8*dstStride;\
2455 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2456 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2459 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2461 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2462 src += 8*srcStride;\
2463 dst += 8*dstStride;\
2464 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2465 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
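/* H264_MC expands into the 16 quarter-pel variants for one block size.
 * The mcXY suffix encodes the fractional offset in quarter pels: mc00 is
 * a plain copy, mc20/mc02 are the horizontal/vertical half-pel filters,
 * mc22 is the 2D half-pel position, and the remaining quarter positions
 * average two neighbouring half/full-pel planes via
 * pixels ## SIZE ## _l2(). */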
2468 #define H264_MC(OPNAME, SIZE) \
2469 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2470 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2473 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2474 uint8_t half[SIZE*SIZE];\
2475 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2476 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2479 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2480 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2483 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2484 uint8_t half[SIZE*SIZE];\
2485 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2486 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2489 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2490 uint8_t full[SIZE*(SIZE+5)];\
2491 uint8_t * const full_mid= full + SIZE*2;\
2492 uint8_t half[SIZE*SIZE];\
2493 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2494 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2495 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2498 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2499 uint8_t full[SIZE*(SIZE+5)];\
2500 uint8_t * const full_mid= full + SIZE*2;\
2501 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2502 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2505 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2506 uint8_t full[SIZE*(SIZE+5)];\
2507 uint8_t * const full_mid= full + SIZE*2;\
2508 uint8_t half[SIZE*SIZE];\
2509 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2510 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2511 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2514 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2515 uint8_t full[SIZE*(SIZE+5)];\
2516 uint8_t * const full_mid= full + SIZE*2;\
2517 uint8_t halfH[SIZE*SIZE];\
2518 uint8_t halfV[SIZE*SIZE];\
2519 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2520 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2521 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2522 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2525 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2526 uint8_t full[SIZE*(SIZE+5)];\
2527 uint8_t * const full_mid= full + SIZE*2;\
2528 uint8_t halfH[SIZE*SIZE];\
2529 uint8_t halfV[SIZE*SIZE];\
2530 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2531 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2532 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2533 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2536 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2537 uint8_t full[SIZE*(SIZE+5)];\
2538 uint8_t * const full_mid= full + SIZE*2;\
2539 uint8_t halfH[SIZE*SIZE];\
2540 uint8_t halfV[SIZE*SIZE];\
2541 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2542 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2543 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2544 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2547 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2548 uint8_t full[SIZE*(SIZE+5)];\
2549 uint8_t * const full_mid= full + SIZE*2;\
2550 uint8_t halfH[SIZE*SIZE];\
2551 uint8_t halfV[SIZE*SIZE];\
2552 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2553 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2554 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2555 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2558 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2559 int16_t tmp[SIZE*(SIZE+5)];\
2560 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2563 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2564 int16_t tmp[SIZE*(SIZE+5)];\
2565 uint8_t halfH[SIZE*SIZE];\
2566 uint8_t halfHV[SIZE*SIZE];\
2567 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2568 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2569 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2572 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2573 int16_t tmp[SIZE*(SIZE+5)];\
2574 uint8_t halfH[SIZE*SIZE];\
2575 uint8_t halfHV[SIZE*SIZE];\
2576 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2577 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2578 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2581 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2582 uint8_t full[SIZE*(SIZE+5)];\
2583 uint8_t * const full_mid= full + SIZE*2;\
2584 int16_t tmp[SIZE*(SIZE+5)];\
2585 uint8_t halfV[SIZE*SIZE];\
2586 uint8_t halfHV[SIZE*SIZE];\
2587 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2588 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2589 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2590 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2593 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2594 uint8_t full[SIZE*(SIZE+5)];\
2595 uint8_t * const full_mid= full + SIZE*2;\
2596 int16_t tmp[SIZE*(SIZE+5)];\
2597 uint8_t halfV[SIZE*SIZE];\
2598 uint8_t halfHV[SIZE*SIZE];\
2599 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2600 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2601 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2602 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2605 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2606 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2607 #define op_put(a, b) a = cm[((b) + 16)>>5]
2608 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2609 #define op2_put(a, b) a = cm[((b) + 512)>>10]
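/* op/op2 rounding: single-pass results carry a gain of 32, hence
 * (+16)>>5; the hv second pass carries 32*32 = 1024, hence (+512)>>10.
 * The cm[] lookup (ff_cropTbl + MAX_NEG_CROP) clips the result to 0..255
 * without an explicit branch. */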
2611 H264_LOWPASS(put_ , op_put, op2_put)
2612 H264_LOWPASS(avg_ , op_avg, op2_avg)
2613 H264_MC(put_, 2)
2614 H264_MC(put_, 4)
2615 H264_MC(put_, 8)
2616 H264_MC(put_, 16)
2617 H264_MC(avg_, 4)
2618 H264_MC(avg_, 8)
2619 H264_MC(avg_, 16)
2621 #undef op_avg
2622 #undef op_put
2623 #undef op2_avg
2624 #undef op2_put
2625 #endif
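/* Example use (a sketch, assuming the usual DSPContext table layout in
 * which the second index is x + 4*y and the first selects the block
 * size):
 *
 *     // (x=1, y=3) quarter-pel prediction of an 8x8 H.264 block
 *     c->put_h264_qpel_pixels_tab[1][1 + 4*3](dst, src, stride);
 *
 * The tables themselves are wired up in dsputil_init() later in this
 * file. */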
2627 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2628 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2629 #define H264_WEIGHT(W,H) \
2630 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2631 int y; \
2632 offset <<= log2_denom; \
2633 if(log2_denom) offset += 1<<(log2_denom-1); \
2634 for(y=0; y<H; y++, block += stride){ \
2635 op_scale1(0); \
2636 op_scale1(1); \
2637 if(W==2) continue; \
2638 op_scale1(2); \
2639 op_scale1(3); \
2640 if(W==4) continue; \
2641 op_scale1(4); \
2642 op_scale1(5); \
2643 op_scale1(6); \
2644 op_scale1(7); \
2645 if(W==8) continue; \
2646 op_scale1(8); \
2647 op_scale1(9); \
2648 op_scale1(10); \
2649 op_scale1(11); \
2650 op_scale1(12); \
2651 op_scale1(13); \
2652 op_scale1(14); \
2653 op_scale1(15); \
2656 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2657 int y; \
2658 offset = ((offset + 1) | 1) << log2_denom; \
2659 for(y=0; y<H; y++, dst += stride, src += stride){ \
2660 op_scale2(0); \
2661 op_scale2(1); \
2662 if(W==2) continue; \
2663 op_scale2(2); \
2664 op_scale2(3); \
2665 if(W==4) continue; \
2666 op_scale2(4); \
2667 op_scale2(5); \
2668 op_scale2(6); \
2669 op_scale2(7); \
2670 if(W==8) continue; \
2671 op_scale2(8); \
2672 op_scale2(9); \
2673 op_scale2(10); \
2674 op_scale2(11); \
2675 op_scale2(12); \
2676 op_scale2(13); \
2677 op_scale2(14); \
2678 op_scale2(15); \
2682 H264_WEIGHT(16,16)
2683 H264_WEIGHT(16,8)
2684 H264_WEIGHT(8,16)
2685 H264_WEIGHT(8,8)
2686 H264_WEIGHT(8,4)
2687 H264_WEIGHT(4,8)
2688 H264_WEIGHT(4,4)
2689 H264_WEIGHT(4,2)
2690 H264_WEIGHT(2,4)
2691 H264_WEIGHT(2,2)
2693 #undef op_scale1
2694 #undef op_scale2
2695 #undef H264_WEIGHT
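/* H.264 explicit weighted prediction, as instantiated above.
 * weight_h264_pixels* computes clip((pix*weight + round) >> log2_denom)
 * + offset, with the offset pre-scaled by 1<<log2_denom and a rounding
 * half folded in before the loop, so the per-pixel work is one multiply,
 * add and shift. E.g. log2_denom=5, weight=48, offset=2 maps pix=100 to
 * (100*48 + 64 + 16) >> 5 = 152. The biweight version blends two
 * references; its ((offset + 1) | 1) << log2_denom pre-bias makes the
 * final >> (log2_denom + 1) round to nearest while adding the averaged
 * offset ((offset + 1) >> 1). */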
2697 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2699 int i;
2701 for(i=0; i<h; i++){
2702 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2703 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2704 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2705 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2706 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2707 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2708 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2709 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2710 dst+=dstStride;
2711 src+=srcStride;
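/* WMV2 "mspel" half-pel filter: 4 taps (-1, 9, 9, -1)/16 with a +8
 * rounding term, applied horizontally here and vertically in
 * wmv2_mspel8_v_lowpass() below. */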
2715 #if CONFIG_CAVS_DECODER
2716 /* AVS specific */
2717 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2719 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2720 put_pixels8_c(dst, src, stride, 8);
2722 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2723 avg_pixels8_c(dst, src, stride, 8);
2725 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2726 put_pixels16_c(dst, src, stride, 16);
2728 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2729 avg_pixels16_c(dst, src, stride, 16);
2731 #endif /* CONFIG_CAVS_DECODER */
2733 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
2734 /* VC-1 specific */
2735 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2737 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2738 put_pixels8_c(dst, src, stride, 8);
2740 #endif /* CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER */
2742 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2744 /* H264 specific */
2745 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2747 #if CONFIG_RV30_DECODER
2748 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2749 #endif /* CONFIG_RV30_DECODER */
2751 #if CONFIG_RV40_DECODER
2752 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2753 put_pixels16_xy2_c(dst, src, stride, 16);
2755 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2756 avg_pixels16_xy2_c(dst, src, stride, 16);
2758 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2759 put_pixels8_xy2_c(dst, src, stride, 8);
2761 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2762 avg_pixels8_xy2_c(dst, src, stride, 8);
2765 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2766 #endif /* CONFIG_RV40_DECODER */
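/* RV40 implements most of its sub-pel positions in rv40dsp; the (3,3)
 * case above is simply served by the plain 2x2-average (xy2) half-pel
 * copies. */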
2768 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2769 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2770 int i;
2772 for(i=0; i<w; i++){
2773 const int src_1= src[ -srcStride];
2774 const int src0 = src[0 ];
2775 const int src1 = src[ srcStride];
2776 const int src2 = src[2*srcStride];
2777 const int src3 = src[3*srcStride];
2778 const int src4 = src[4*srcStride];
2779 const int src5 = src[5*srcStride];
2780 const int src6 = src[6*srcStride];
2781 const int src7 = src[7*srcStride];
2782 const int src8 = src[8*srcStride];
2783 const int src9 = src[9*srcStride];
2784 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2785 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2786 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2787 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2788 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2789 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2790 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2791 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2792 src++;
2793 dst++;
2797 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2798 put_pixels8_c(dst, src, stride, 8);
2801 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2802 uint8_t half[64];
2803 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2804 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2807 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2808 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2811 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2812 uint8_t half[64];
2813 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2814 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2817 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2818 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2821 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2822 uint8_t halfH[88];
2823 uint8_t halfV[64];
2824 uint8_t halfHV[64];
2825 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2826 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2827 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2828 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2830 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2831 uint8_t halfH[88];
2832 uint8_t halfV[64];
2833 uint8_t halfHV[64];
2834 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2835 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2836 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2837 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2839 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2840 uint8_t halfH[88];
2841 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2842 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
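/* Buffer shapes in the mspel combiners above: halfH is 8x11 bytes (88)
 * because the vertical filter needs one row above and two rows below its
 * 8 output rows, and the horizontal pass starts one row early
 * (src - stride); the vertical passes therefore read from halfH + 8,
 * i.e. one 8-byte row in. */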
2845 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2846 if(CONFIG_ANY_H263) {
2847 int x;
2848 const int strength= ff_h263_loop_filter_strength[qscale];
2850 for(x=0; x<8; x++){
2851 int d1, d2, ad1;
2852 int p0= src[x-2*stride];
2853 int p1= src[x-1*stride];
2854 int p2= src[x+0*stride];
2855 int p3= src[x+1*stride];
2856 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2858 if (d<-2*strength) d1= 0;
2859 else if(d<- strength) d1=-2*strength - d;
2860 else if(d< strength) d1= d;
2861 else if(d< 2*strength) d1= 2*strength - d;
2862 else d1= 0;
2864 p1 += d1;
2865 p2 -= d1;
2866 if(p1&256) p1= ~(p1>>31);
2867 if(p2&256) p2= ~(p2>>31);
2869 src[x-1*stride] = p1;
2870 src[x+0*stride] = p2;
2872 ad1= FFABS(d1)>>1;
2874 d2= av_clip((p0-p3)/4, -ad1, ad1);
2876 src[x-2*stride] = p0 - d2;
2877 src[x+ stride] = p3 + d2;
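/* The piecewise d1 above is the H.263 deblocking nonlinearity: the
 * correction follows d up to +-strength, then ramps back down to zero so
 * strong edges (likely real image content) are left untouched.
 * "if(p1&256) p1= ~(p1>>31)" is a quick clip for the value range
 * possible here: negatives end up as 0, overflows as -1, which the
 * uint8_t store turns into 255. h263_h_loop_filter_c below is the
 * transposed version of the same filter. */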
2882 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2883 if(CONFIG_ANY_H263) {
2884 int y;
2885 const int strength= ff_h263_loop_filter_strength[qscale];
2887 for(y=0; y<8; y++){
2888 int d1, d2, ad1;
2889 int p0= src[y*stride-2];
2890 int p1= src[y*stride-1];
2891 int p2= src[y*stride+0];
2892 int p3= src[y*stride+1];
2893 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2895 if (d<-2*strength) d1= 0;
2896 else if(d<- strength) d1=-2*strength - d;
2897 else if(d< strength) d1= d;
2898 else if(d< 2*strength) d1= 2*strength - d;
2899 else d1= 0;
2901 p1 += d1;
2902 p2 -= d1;
2903 if(p1&256) p1= ~(p1>>31);
2904 if(p2&256) p2= ~(p2>>31);
2906 src[y*stride-1] = p1;
2907 src[y*stride+0] = p2;
2909 ad1= FFABS(d1)>>1;
2911 d2= av_clip((p0-p3)/4, -ad1, ad1);
2913 src[y*stride-2] = p0 - d2;
2914 src[y*stride+1] = p3 + d2;
2919 static void h261_loop_filter_c(uint8_t *src, int stride){
2920 int x,y,xy,yz;
2921 int temp[64];
2923 for(x=0; x<8; x++){
2924 temp[x ] = 4*src[x ];
2925 temp[x + 7*8] = 4*src[x + 7*stride];
2927 for(y=1; y<7; y++){
2928 for(x=0; x<8; x++){
2929 xy = y * stride + x;
2930 yz = y * 8 + x;
2931 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2935 for(y=0; y<8; y++){
2936 src[ y*stride] = (temp[ y*8] + 2)>>2;
2937 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2938 for(x=1; x<7; x++){
2939 xy = y * stride + x;
2940 yz = y * 8 + x;
2941 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
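/* h261_loop_filter applies a separable (1,2,1)x(1,2,1)/16 smoothing
 * inside the 8x8 block; border rows and columns only receive the 1D half
 * of the filter (the 4*src and (temp+2)>>2 special cases), and the four
 * corners pass through unchanged. */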
2946 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2948 int i, d;
2949 for( i = 0; i < 4; i++ ) {
2950 if( tc0[i] < 0 ) {
2951 pix += 4*ystride;
2952 continue;
2954 for( d = 0; d < 4; d++ ) {
2955 const int p0 = pix[-1*xstride];
2956 const int p1 = pix[-2*xstride];
2957 const int p2 = pix[-3*xstride];
2958 const int q0 = pix[0];
2959 const int q1 = pix[1*xstride];
2960 const int q2 = pix[2*xstride];
2962 if( FFABS( p0 - q0 ) < alpha &&
2963 FFABS( p1 - p0 ) < beta &&
2964 FFABS( q1 - q0 ) < beta ) {
2966 int tc = tc0[i];
2967 int i_delta;
2969 if( FFABS( p2 - p0 ) < beta ) {
2970 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2971 tc++;
2973 if( FFABS( q2 - q0 ) < beta ) {
2974 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2975 tc++;
2978 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2979 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2980 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2982 pix += ystride;
2986 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2988 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2990 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2992 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
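/* Standard H.264 in-loop deblocking, normal (non-intra) filter above.
 * Each group of 4 lines shares one tc0[i] threshold; p1/q1 receive an
 * extra correction when the inner samples are smooth (|p2-p0|, resp.
 * |q2-q0|, below beta), and each such correction widens the clipping
 * range tc by one. The _intra variants below apply the stronger filter
 * permitted when |p0-q0| < (alpha>>2) + 2. */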
2995 static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2997 int d;
2998 for( d = 0; d < 16; d++ ) {
2999 const int p2 = pix[-3*xstride];
3000 const int p1 = pix[-2*xstride];
3001 const int p0 = pix[-1*xstride];
3003 const int q0 = pix[ 0*xstride];
3004 const int q1 = pix[ 1*xstride];
3005 const int q2 = pix[ 2*xstride];
3007 if( FFABS( p0 - q0 ) < alpha &&
3008 FFABS( p1 - p0 ) < beta &&
3009 FFABS( q1 - q0 ) < beta ) {
3011 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3012 if( FFABS( p2 - p0 ) < beta)
3014 const int p3 = pix[-4*xstride];
3015 /* p0', p1', p2' */
3016 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3017 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3018 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3019 } else {
3020 /* p0' */
3021 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3023 if( FFABS( q2 - q0 ) < beta)
3025 const int q3 = pix[3*xstride];
3026 /* q0', q1', q2' */
3027 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3028 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3029 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3030 } else {
3031 /* q0' */
3032 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3034 }else{
3035 /* p0', q0' */
3036 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3037 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3040 pix += ystride;
3043 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3045 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3047 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3049 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
3052 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3054 int i, d;
3055 for( i = 0; i < 4; i++ ) {
3056 const int tc = tc0[i];
3057 if( tc <= 0 ) {
3058 pix += 2*ystride;
3059 continue;
3061 for( d = 0; d < 2; d++ ) {
3062 const int p0 = pix[-1*xstride];
3063 const int p1 = pix[-2*xstride];
3064 const int q0 = pix[0];
3065 const int q1 = pix[1*xstride];
3067 if( FFABS( p0 - q0 ) < alpha &&
3068 FFABS( p1 - p0 ) < beta &&
3069 FFABS( q1 - q0 ) < beta ) {
3071 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3073 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3074 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
3076 pix += ystride;
3080 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3082 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3084 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3086 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3089 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3091 int d;
3092 for( d = 0; d < 8; d++ ) {
3093 const int p0 = pix[-1*xstride];
3094 const int p1 = pix[-2*xstride];
3095 const int q0 = pix[0];
3096 const int q1 = pix[1*xstride];
3098 if( FFABS( p0 - q0 ) < alpha &&
3099 FFABS( p1 - p0 ) < beta &&
3100 FFABS( q1 - q0 ) < beta ) {
3102 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3103 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3105 pix += ystride;
3108 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3110 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3112 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3114 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
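/* SAD (sum of absolute differences) comparators used by motion
 * estimation. pix_abs16/pix_abs8 compare against the reference directly;
 * the _x2, _y2 and _xy2 variants compare against horizontally,
 * vertically or 2D half-pel averaged references, using the rounding
 * averages avg2()/avg4() defined earlier in this file. */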
3117 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3119 int s, i;
3121 s = 0;
3122 for(i=0;i<h;i++) {
3123 s += abs(pix1[0] - pix2[0]);
3124 s += abs(pix1[1] - pix2[1]);
3125 s += abs(pix1[2] - pix2[2]);
3126 s += abs(pix1[3] - pix2[3]);
3127 s += abs(pix1[4] - pix2[4]);
3128 s += abs(pix1[5] - pix2[5]);
3129 s += abs(pix1[6] - pix2[6]);
3130 s += abs(pix1[7] - pix2[7]);
3131 s += abs(pix1[8] - pix2[8]);
3132 s += abs(pix1[9] - pix2[9]);
3133 s += abs(pix1[10] - pix2[10]);
3134 s += abs(pix1[11] - pix2[11]);
3135 s += abs(pix1[12] - pix2[12]);
3136 s += abs(pix1[13] - pix2[13]);
3137 s += abs(pix1[14] - pix2[14]);
3138 s += abs(pix1[15] - pix2[15]);
3139 pix1 += line_size;
3140 pix2 += line_size;
3142 return s;
3145 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3147 int s, i;
3149 s = 0;
3150 for(i=0;i<h;i++) {
3151 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3152 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3153 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3154 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3155 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3156 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3157 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3158 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3159 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3160 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3161 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3162 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3163 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3164 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3165 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3166 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3167 pix1 += line_size;
3168 pix2 += line_size;
3170 return s;
3173 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3175 int s, i;
3176 uint8_t *pix3 = pix2 + line_size;
3178 s = 0;
3179 for(i=0;i<h;i++) {
3180 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3181 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3182 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3183 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3184 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3185 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3186 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3187 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3188 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3189 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3190 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3191 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3192 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3193 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3194 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3195 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3196 pix1 += line_size;
3197 pix2 += line_size;
3198 pix3 += line_size;
3199 }
3200 return s;
3201 }
3203 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3204 {
3205 int s, i;
3206 uint8_t *pix3 = pix2 + line_size;
3208 s = 0;
3209 for(i=0;i<h;i++) {
3210 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3211 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3212 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3213 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3214 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3215 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3216 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3217 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3218 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3219 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3220 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3221 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3222 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3223 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3224 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3225 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3226 pix1 += line_size;
3227 pix2 += line_size;
3228 pix3 += line_size;
3229 }
3230 return s;
3231 }
3233 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3234 {
3235 int s, i;
3237 s = 0;
3238 for(i=0;i<h;i++) {
3239 s += abs(pix1[0] - pix2[0]);
3240 s += abs(pix1[1] - pix2[1]);
3241 s += abs(pix1[2] - pix2[2]);
3242 s += abs(pix1[3] - pix2[3]);
3243 s += abs(pix1[4] - pix2[4]);
3244 s += abs(pix1[5] - pix2[5]);
3245 s += abs(pix1[6] - pix2[6]);
3246 s += abs(pix1[7] - pix2[7]);
3247 pix1 += line_size;
3248 pix2 += line_size;
3249 }
3250 return s;
3251 }
3253 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3254 {
3255 int s, i;
3257 s = 0;
3258 for(i=0;i<h;i++) {
3259 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3260 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3261 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3262 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3263 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3264 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3265 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3266 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3267 pix1 += line_size;
3268 pix2 += line_size;
3269 }
3270 return s;
3271 }
3273 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3274 {
3275 int s, i;
3276 uint8_t *pix3 = pix2 + line_size;
3278 s = 0;
3279 for(i=0;i<h;i++) {
3280 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3281 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3282 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3283 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3284 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3285 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3286 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3287 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3288 pix1 += line_size;
3289 pix2 += line_size;
3290 pix3 += line_size;
3291 }
3292 return s;
3293 }
3295 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3296 {
3297 int s, i;
3298 uint8_t *pix3 = pix2 + line_size;
3300 s = 0;
3301 for(i=0;i<h;i++) {
3302 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3303 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3304 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3305 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3306 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3307 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3308 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3309 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3310 pix1 += line_size;
3311 pix2 += line_size;
3312 pix3 += line_size;
3313 }
3314 return s;
3315 }
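/* The pix_abs* family computes the sum of absolute differences (SAD);
 * the _x2/_y2/_xy2 variants compare against the horizontally, vertically
 * or diagonally averaged (half-pel) reference.  Motion estimation picks
 * them from the tables filled in by dsputil_init(); a hedged sketch of a
 * caller (function name and loop context are assumptions):
 */
#if 0
static int sad_halfpel_example(DSPContext *dsp, uint8_t *cur, uint8_t *ref,
                               int stride, int hx, int hy)
{
    /* [0] selects the 16x16 functions; (hx&1) + 2*(hy&1) the subpel case */
    return dsp->pix_abs[0][(hx&1) + 2*(hy&1)](NULL, cur, ref, stride, 16);
}
#endif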
3317 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3318 MpegEncContext *c = v;
3319 int score1=0;
3320 int score2=0;
3321 int x,y;
3323 for(y=0; y<h; y++){
3324 for(x=0; x<16; x++){
3325 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3326 }
3327 if(y+1<h){
3328 for(x=0; x<15; x++){
3329 score2+= FFABS( s1[x ] - s1[x +stride]
3330 - s1[x+1] + s1[x+1+stride])
3331 -FFABS( s2[x ] - s2[x +stride]
3332 - s2[x+1] + s2[x+1+stride]);
3333 }
3334 }
3335 s1+= stride;
3336 s2+= stride;
3337 }
3339 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3340 else return score1 + FFABS(score2)*8;
3341 }
3343 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3344 MpegEncContext *c = v;
3345 int score1=0;
3346 int score2=0;
3347 int x,y;
3349 for(y=0; y<h; y++){
3350 for(x=0; x<8; x++){
3351 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3352 }
3353 if(y+1<h){
3354 for(x=0; x<7; x++){
3355 score2+= FFABS( s1[x ] - s1[x +stride]
3356 - s1[x+1] + s1[x+1+stride])
3357 -FFABS( s2[x ] - s2[x +stride]
3358 - s2[x+1] + s2[x+1+stride]);
3359 }
3360 }
3361 s1+= stride;
3362 s2+= stride;
3363 }
3365 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3366 else return score1 + FFABS(score2)*8;
3367 }
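/* NSSE ("noise preserving" SSE) mixes plain SSE (score1) with the
 * difference in local gradient energy between the two blocks (score2),
 * so a reconstruction that blurs away texture is penalized even when its
 * SSE is small.  Illustratively, with the default weight of 8:
 *
 *     cost = SSE + 8 * |texture(s1) - texture(s2)|
 *
 * avctx->nsse_weight replaces the 8 when a context is available.
 */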
3369 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3370 int i;
3371 unsigned int sum=0;
3373 for(i=0; i<8*8; i++){
3374 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3375 int w= weight[i];
3376 b>>= RECON_SHIFT;
3377 assert(-512<b && b<512);
3379 sum += (w*b)*(w*b)>>4;
3380 }
3381 return sum>>2;
3382 }
3384 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3385 int i;
3387 for(i=0; i<8*8; i++){
3388 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3389 }
3390 }
3392 /**
3393 * Permutes an 8x8 block.
3394 * @param block the block which will be permuted according to the given permutation vector
3395 * @param permutation the permutation vector
3396 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3397 * @param scantable the used scantable; this is only used to speed the permutation up, the block is not
3398 * (inverse) permuted to scantable order!
3399 */
3400 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3401 {
3402 int i;
3403 DCTELEM temp[64];
3405 if(last<=0) return;
3406 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3408 for(i=0; i<=last; i++){
3409 const int j= scantable[i];
3410 temp[j]= block[j];
3411 block[j]=0;
3412 }
3414 for(i=0; i<=last; i++){
3415 const int j= scantable[i];
3416 const int perm_j= permutation[j];
3417 block[perm_j]= temp[j];
3418 }
3419 }
3421 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3422 return 0;
3423 }
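/* ff_block_permute() lets an encoder reorder quantized coefficients into
 * whatever layout the selected (possibly SIMD) IDCT expects.  A minimal
 * self-contained sketch (illustrative, not library code); with the
 * identity permutation the block must come back unchanged:
 */
#if 0
static void block_permute_example(void)
{
    uint8_t perm[64];
    DCTELEM block[64];
    int i;
    for(i=0; i<64; i++){
        perm[i]  = i;  /* FF_NO_IDCT_PERM style identity permutation */
        block[i] = i;
    }
    ff_block_permute(block, perm, ff_zigzag_direct, 63);
}
#endif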
3425 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3426 int i;
3428 memset(cmp, 0, sizeof(void*)*6);
3430 for(i=0; i<6; i++){
3431 switch(type&0xFF){
3432 case FF_CMP_SAD:
3433 cmp[i]= c->sad[i];
3434 break;
3435 case FF_CMP_SATD:
3436 cmp[i]= c->hadamard8_diff[i];
3437 break;
3438 case FF_CMP_SSE:
3439 cmp[i]= c->sse[i];
3440 break;
3441 case FF_CMP_DCT:
3442 cmp[i]= c->dct_sad[i];
3443 break;
3444 case FF_CMP_DCT264:
3445 cmp[i]= c->dct264_sad[i];
3446 break;
3447 case FF_CMP_DCTMAX:
3448 cmp[i]= c->dct_max[i];
3449 break;
3450 case FF_CMP_PSNR:
3451 cmp[i]= c->quant_psnr[i];
3452 break;
3453 case FF_CMP_BIT:
3454 cmp[i]= c->bit[i];
3455 break;
3456 case FF_CMP_RD:
3457 cmp[i]= c->rd[i];
3458 break;
3459 case FF_CMP_VSAD:
3460 cmp[i]= c->vsad[i];
3461 break;
3462 case FF_CMP_VSSE:
3463 cmp[i]= c->vsse[i];
3464 break;
3465 case FF_CMP_ZERO:
3466 cmp[i]= zero_cmp;
3467 break;
3468 case FF_CMP_NSSE:
3469 cmp[i]= c->nsse[i];
3470 break;
3471 #if CONFIG_SNOW_ENCODER
3472 case FF_CMP_W53:
3473 cmp[i]= c->w53[i];
3474 break;
3475 case FF_CMP_W97:
3476 cmp[i]= c->w97[i];
3477 break;
3478 #endif
3479 default:
3480 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3481 }
3482 }
3483 }
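/* ff_set_cmp() fills one comparison-function table according to an
 * FF_CMP_* selector; entries [0] and [1] are the 16x16 and 8x8 versions.
 * A hedged usage sketch mirroring how the encoder wires its motion
 * estimation metrics (the exact call sites live elsewhere):
 */
#if 0
static void set_cmp_example(MpegEncContext *s)
{
    ff_set_cmp(&s->dsp, s->dsp.me_cmp,     s->avctx->me_cmp);
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
}
#endif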
3485 static void clear_block_c(DCTELEM *block)
3486 {
3487 memset(block, 0, sizeof(DCTELEM)*64);
3488 }
3490 /**
3491 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3492 */
3493 static void clear_blocks_c(DCTELEM *blocks)
3494 {
3495 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3496 }
3498 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3499 long i;
3500 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3501 long a = *(long*)(src+i);
3502 long b = *(long*)(dst+i);
3503 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3504 }
3505 for(; i<w; i++)
3506 dst[i+0] += src[i+0];
3507 }
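/* add_bytes_c() adds packed bytes one machine word at a time (a SWAR
 * trick): masking with pb_7f clears each byte's MSB so per-byte sums
 * cannot carry into the neighbouring byte, and XORing with ((a^b)&pb_80)
 * restores each sum's MSB.  Worked 16-bit example (illustrative):
 *
 *     a = 0x80FF, b = 0x0101
 *     (a&0x7f7f) + (b&0x7f7f) = 0x007F + 0x0101 = 0x0180
 *     (a^b) & 0x8080          = 0x8080
 *     0x0180 ^ 0x8080         = 0x8100  == bytewise (0x80+0x01, 0xFF+0x01)
 */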
3509 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3510 long i;
3511 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3512 long a = *(long*)(src1+i);
3513 long b = *(long*)(src2+i);
3514 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3515 }
3516 for(; i<w; i++)
3517 dst[i] = src1[i]+src2[i];
3518 }
3520 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3521 long i;
3522 #if !HAVE_FAST_UNALIGNED
3523 if((long)src2 & (sizeof(long)-1)){
3524 for(i=0; i+7<w; i+=8){
3525 dst[i+0] = src1[i+0]-src2[i+0];
3526 dst[i+1] = src1[i+1]-src2[i+1];
3527 dst[i+2] = src1[i+2]-src2[i+2];
3528 dst[i+3] = src1[i+3]-src2[i+3];
3529 dst[i+4] = src1[i+4]-src2[i+4];
3530 dst[i+5] = src1[i+5]-src2[i+5];
3531 dst[i+6] = src1[i+6]-src2[i+6];
3532 dst[i+7] = src1[i+7]-src2[i+7];
3533 }
3534 }else
3535 #endif
3536 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3537 long a = *(long*)(src1+i);
3538 long b = *(long*)(src2+i);
3539 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3540 }
3541 for(; i<w; i++)
3542 dst[i+0] = src1[i+0]-src2[i+0];
3543 }
3545 static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
3546 int i;
3547 uint8_t l, lt;
3549 l= *left;
3550 lt= *left_top;
3552 for(i=0; i<w; i++){
3553 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3554 lt= src1[i];
3555 dst[i]= l;
3556 }
3558 *left= l;
3559 *left_top= lt;
3560 }
3562 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3563 int i;
3564 uint8_t l, lt;
3566 l= *left;
3567 lt= *left_top;
3569 for(i=0; i<w; i++){
3570 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3571 lt= src1[i];
3572 l= src2[i];
3573 dst[i]= l - pred;
3574 }
3576 *left= l;
3577 *left_top= lt;
3578 }
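/* HuffYUV median prediction: each byte is predicted as
 * mid_pred(left, top, left + top - topleft), the median of the left
 * neighbour, the top neighbour and their gradient estimate;
 * add_hfyu_* reconstructs pixels from residuals, sub_hfyu_* produces
 * them.  Worked example (illustrative): left=10, top=14, topleft=12
 * gives gradient 10+14-12 = 12 and prediction median(10,14,12) = 12.
 */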
3580 #define BUTTERFLY2(o1,o2,i1,i2) \
3581 o1= (i1)+(i2);\
3582 o2= (i1)-(i2);
3584 #define BUTTERFLY1(x,y) \
3585 {\
3586 int a,b;\
3587 a= x;\
3588 b= y;\
3589 x= a+b;\
3590 y= a-b;\
3591 }
3593 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3595 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3596 int i;
3597 int temp[64];
3598 int sum=0;
3600 assert(h==8);
3602 for(i=0; i<8; i++){
3603 //FIXME try pointer walks
3604 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3605 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3606 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3607 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3609 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3610 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3611 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3612 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3614 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3615 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3616 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3617 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3618 }
3620 for(i=0; i<8; i++){
3621 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3622 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3623 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3624 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3626 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3627 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3628 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3629 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3631 sum +=
3632 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3633 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3634 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3635 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3636 }
3637 #if 0
3638 static int maxi=0;
3639 if(sum>maxi){
3640 maxi=sum;
3641 printf("MAX:%d\n", maxi);
3642 }
3643 #endif
3644 return sum;
3645 }
3647 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3648 int i;
3649 int temp[64];
3650 int sum=0;
3652 assert(h==8);
3654 for(i=0; i<8; i++){
3655 //FIXME try pointer walks
3656 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3657 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3658 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3659 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3661 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3662 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3663 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3664 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3666 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3667 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3668 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3669 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3670 }
3672 for(i=0; i<8; i++){
3673 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3674 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3675 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3676 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3678 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3679 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3680 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3681 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3683 sum +=
3684 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3685 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3686 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3687 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3688 }
3690 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3692 return sum;
3693 }
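/* hadamard8_diff applies a 2-D 8x8 Hadamard transform (BUTTERFLY passes
 * over rows, then columns) to the src-dst difference and sums the
 * absolute coefficients: the SATD metric behind FF_CMP_SATD.  The
 * _intra variant transforms the block itself and subtracts
 * |temp[0] + temp[32]| once, removing the DC term so a flat block
 * scores ~0.  SATD tracks real coding cost better than SAD because the
 * transform concentrates structured differences into few coefficients.
 */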
3695 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3696 MpegEncContext * const s= (MpegEncContext *)c;
3697 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3698 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3700 assert(h==8);
3702 s->dsp.diff_pixels(temp, src1, src2, stride);
3703 s->dsp.fdct(temp);
3704 return s->dsp.sum_abs_dctelem(temp);
3705 }
3707 #if CONFIG_GPL
3708 #define DCT8_1D {\
3709 const int s07 = SRC(0) + SRC(7);\
3710 const int s16 = SRC(1) + SRC(6);\
3711 const int s25 = SRC(2) + SRC(5);\
3712 const int s34 = SRC(3) + SRC(4);\
3713 const int a0 = s07 + s34;\
3714 const int a1 = s16 + s25;\
3715 const int a2 = s07 - s34;\
3716 const int a3 = s16 - s25;\
3717 const int d07 = SRC(0) - SRC(7);\
3718 const int d16 = SRC(1) - SRC(6);\
3719 const int d25 = SRC(2) - SRC(5);\
3720 const int d34 = SRC(3) - SRC(4);\
3721 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3722 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3723 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3724 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3725 DST(0, a0 + a1 ) ;\
3726 DST(1, a4 + (a7>>2)) ;\
3727 DST(2, a2 + (a3>>1)) ;\
3728 DST(3, a5 + (a6>>2)) ;\
3729 DST(4, a0 - a1 ) ;\
3730 DST(5, a6 - (a5>>2)) ;\
3731 DST(6, (a2>>1) - a3 ) ;\
3732 DST(7, (a4>>2) - a7 ) ;\
3733 }
3735 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3736 MpegEncContext * const s= (MpegEncContext *)c;
3737 DCTELEM dct[8][8];
3738 int i;
3739 int sum=0;
3741 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3743 #define SRC(x) dct[i][x]
3744 #define DST(x,v) dct[i][x]= v
3745 for( i = 0; i < 8; i++ )
3746 DCT8_1D
3747 #undef SRC
3748 #undef DST
3750 #define SRC(x) dct[x][i]
3751 #define DST(x,v) sum += FFABS(v)
3752 for( i = 0; i < 8; i++ )
3753 DCT8_1D
3754 #undef SRC
3755 #undef DST
3756 return sum;
3757 }
3758 #endif
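/* dct264_sad8x8_c runs the same 1-D H.264-style 8-point DCT (DCT8_1D)
 * twice by re-binding the SRC/DST macros: the first pass reads and
 * writes rows (dct[i][x]), the second reads columns (dct[x][i]) and
 * accumulates |coefficient| straight into sum, so no transpose buffer is
 * needed.
 */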
3760 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3761 MpegEncContext * const s= (MpegEncContext *)c;
3762 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3763 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3764 int sum=0, i;
3766 assert(h==8);
3768 s->dsp.diff_pixels(temp, src1, src2, stride);
3769 s->dsp.fdct(temp);
3771 for(i=0; i<64; i++)
3772 sum= FFMAX(sum, FFABS(temp[i]));
3774 return sum;
3775 }
3777 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3778 MpegEncContext * const s= (MpegEncContext *)c;
3779 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3780 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3781 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3782 int sum=0, i;
3784 assert(h==8);
3785 s->mb_intra=0;
3787 s->dsp.diff_pixels(temp, src1, src2, stride);
3789 memcpy(bak, temp, 64*sizeof(DCTELEM));
3791 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3792 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3793 ff_simple_idct(temp); //FIXME
3795 for(i=0; i<64; i++)
3796 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3798 return sum;
3799 }
3801 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3802 MpegEncContext * const s= (MpegEncContext *)c;
3803 const uint8_t *scantable= s->intra_scantable.permutated;
3804 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3805 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3806 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3807 uint8_t * const bak= (uint8_t*)aligned_bak;
3808 int i, last, run, bits, level, distortion, start_i;
3809 const int esc_length= s->ac_esc_length;
3810 uint8_t * length;
3811 uint8_t * last_length;
3813 assert(h==8);
3815 for(i=0; i<8; i++){
3816 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3817 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3818 }
3820 s->dsp.diff_pixels(temp, src1, src2, stride);
3822 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3824 bits=0;
3826 if (s->mb_intra) {
3827 start_i = 1;
3828 length = s->intra_ac_vlc_length;
3829 last_length= s->intra_ac_vlc_last_length;
3830 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3831 } else {
3832 start_i = 0;
3833 length = s->inter_ac_vlc_length;
3834 last_length= s->inter_ac_vlc_last_length;
3835 }
3837 if(last>=start_i){
3838 run=0;
3839 for(i=start_i; i<last; i++){
3840 int j= scantable[i];
3841 level= temp[j];
3843 if(level){
3844 level+=64;
3845 if((level&(~127)) == 0){
3846 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3847 }else
3848 bits+= esc_length;
3849 run=0;
3850 }else
3851 run++;
3852 }
3853 i= scantable[last];
3855 level= temp[i] + 64;
3857 assert(level - 64);
3859 if((level&(~127)) == 0){
3860 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3861 }else
3862 bits+= esc_length;
3863 }
3866 if(last>=0){
3867 if(s->mb_intra)
3868 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3869 else
3870 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3871 }
3873 s->dsp.idct_add(bak, stride, temp);
3875 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3877 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3878 }
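/* rd8x8_c returns an actual rate-distortion cost: the SSE between the
 * source and the quantize/dequantize/IDCT round trip, plus the exact VLC
 * bit count weighted by roughly qscale^2 * 109/128.  Illustratively:
 *
 *     cost = SSE + lambda * bits,   lambda ~= 0.85 * qscale^2
 */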
3880 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3881 MpegEncContext * const s= (MpegEncContext *)c;
3882 const uint8_t *scantable= s->intra_scantable.permutated;
3883 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3884 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3885 int i, last, run, bits, level, start_i;
3886 const int esc_length= s->ac_esc_length;
3887 uint8_t * length;
3888 uint8_t * last_length;
3890 assert(h==8);
3892 s->dsp.diff_pixels(temp, src1, src2, stride);
3894 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3896 bits=0;
3898 if (s->mb_intra) {
3899 start_i = 1;
3900 length = s->intra_ac_vlc_length;
3901 last_length= s->intra_ac_vlc_last_length;
3902 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3903 } else {
3904 start_i = 0;
3905 length = s->inter_ac_vlc_length;
3906 last_length= s->inter_ac_vlc_last_length;
3907 }
3909 if(last>=start_i){
3910 run=0;
3911 for(i=start_i; i<last; i++){
3912 int j= scantable[i];
3913 level= temp[j];
3915 if(level){
3916 level+=64;
3917 if((level&(~127)) == 0){
3918 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3919 }else
3920 bits+= esc_length;
3921 run=0;
3922 }else
3923 run++;
3924 }
3925 i= scantable[last];
3927 level= temp[i] + 64;
3929 assert(level - 64);
3931 if((level&(~127)) == 0){
3932 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3933 }else
3934 bits+= esc_length;
3935 }
3937 return bits;
3938 }
3940 #define VSAD_INTRA(size) \
3941 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3942 int score=0; \
3943 int x,y; \
3945 for(y=1; y<h; y++){ \
3946 for(x=0; x<size; x+=4){ \
3947 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3948 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3949 } \
3950 s+= stride; \
3951 } \
3953 return score; \
3954 }
3955 VSAD_INTRA(8)
3956 VSAD_INTRA(16)
3958 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3959 int score=0;
3960 int x,y;
3962 for(y=1; y<h; y++){
3963 for(x=0; x<16; x++){
3964 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3965 }
3966 s1+= stride;
3967 s2+= stride;
3968 }
3970 return score;
3971 }
3973 #define SQ(a) ((a)*(a))
3974 #define VSSE_INTRA(size) \
3975 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3976 int score=0; \
3977 int x,y; \
3979 for(y=1; y<h; y++){ \
3980 for(x=0; x<size; x+=4){ \
3981 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3982 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3983 } \
3984 s+= stride; \
3985 } \
3987 return score; \
3988 }
3989 VSSE_INTRA(8)
3990 VSSE_INTRA(16)
3992 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3993 int score=0;
3994 int x,y;
3996 for(y=1; y<h; y++){
3997 for(x=0; x<16; x++){
3998 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3999 }
4000 s1+= stride;
4001 s2+= stride;
4002 }
4004 return score;
4005 }
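/* The vsad/vsse metrics compare vertical gradients rather than pixels:
 * the inter versions sum the absolute (or squared) change of (s1 - s2)
 * between adjacent rows, the _intra versions the row-to-row variation of
 * the block itself.  They are typically used as the ildct comparison
 * when deciding between frame and field DCT of a macroblock.
 */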
4007 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
4008 int size){
4009 int score=0;
4010 int i;
4011 for(i=0; i<size; i++)
4012 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
4013 return score;
4014 }
4016 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4017 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4018 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4019 #if CONFIG_GPL
4020 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4021 #endif
4022 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4023 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4024 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4025 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4027 static void vector_fmul_c(float *dst, const float *src, int len){
4028 int i;
4029 for(i=0; i<len; i++)
4030 dst[i] *= src[i];
4031 }
4033 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
4034 int i;
4035 src1 += len-1;
4036 for(i=0; i<len; i++)
4037 dst[i] = src0[i] * src1[-i];
4038 }
4040 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
4041 int i;
4042 for(i=0; i<len; i++)
4043 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
4044 }
4046 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
4047 int i,j;
4048 dst += len;
4049 win += len;
4050 src0+= len;
4051 for(i=-len, j=len-1; i<0; i++, j--) {
4052 float s0 = src0[i];
4053 float s1 = src1[j];
4054 float wi = win[i];
4055 float wj = win[j];
4056 dst[i] = s0*wj - s1*wi + add_bias;
4057 dst[j] = s0*wi + s1*wj + add_bias;
4058 }
4059 }
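/* ff_vector_fmul_window_c does the windowed overlap-add of two MDCT
 * halves in a single pass.  With i running backwards over src0 and j
 * forwards over src1 it emits the symmetric pair
 *
 *     dst[i] = s0*win[j] - s1*win[i] + add_bias
 *     dst[j] = s0*win[i] + s1*win[j] + add_bias
 *
 * add_bias is the offset (e.g. 385.0) some float decoders fold in here
 * so that float_to_int16() below can use its integer bit trick.
 */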
4061 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
4062 int i;
4063 for(i=0; i<len; i++)
4064 dst[i] = src[i] * mul;
4065 }
4067 static av_always_inline int float_to_int16_one(const float *src){
4068 int_fast32_t tmp = *(const int32_t*)src;
4069 if(tmp & 0xf0000){
4070 tmp = (0x43c0ffff - tmp)>>31;
4071 // is this faster on some gcc/cpu combinations?
4072 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4073 // else tmp = 0;
4074 }
4075 return tmp - 0x8000;
4076 }
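/* float_to_int16_one() assumes the caller pre-biased each sample by
 * 385.0: every float in [384.0, 386.0) shares one exponent, so the low
 * 16 bits of its IEEE-754 pattern encode the sample linearly, and the
 * final "- 0x8000" recenters them (the int16_t store truncates the high
 * bits).  (tmp & 0xf0000) is nonzero exactly when the value left that
 * range, and (0x43c0ffff - tmp) >> 31 yields 0 below the range and -1
 * above it, which the same "- 0x8000" plus truncation turn into -32768
 * and 32767: a branchless clamp.  Worked example (illustrative): 385.0f
 * has the bit pattern 0x43c08000, whose low 16 bits minus 0x8000 give 0.
 */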
4078 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
4079 int i;
4080 for(i=0; i<len; i++)
4081 dst[i] = float_to_int16_one(src+i);
4082 }
4084 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4085 int i,j,c;
4086 if(channels==2){
4087 for(i=0; i<len; i++){
4088 dst[2*i] = float_to_int16_one(src[0]+i);
4089 dst[2*i+1] = float_to_int16_one(src[1]+i);
4090 }
4091 }else{
4092 for(c=0; c<channels; c++)
4093 for(i=0, j=c; i<len; i++, j+=channels)
4094 dst[j] = float_to_int16_one(src[c]+i);
4095 }
4096 }
4098 static void add_int16_c(int16_t * v1, int16_t * v2, int order)
4099 {
4100 while (order--)
4101 *v1++ += *v2++;
4102 }
4104 static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
4105 {
4106 while (order--)
4107 *v1++ -= *v2++;
4108 }
4110 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4111 {
4112 int res = 0;
4114 while (order--)
4115 res += (*v1++ * *v2++) >> shift;
4117 return res;
4118 }
4120 #define W0 2048
4121 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4122 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4123 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4124 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4125 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4126 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4127 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
4129 static void wmv2_idct_row(short * b)
4130 {
4131 int s1,s2;
4132 int a0,a1,a2,a3,a4,a5,a6,a7;
4133 /*step 1*/
4134 a1 = W1*b[1]+W7*b[7];
4135 a7 = W7*b[1]-W1*b[7];
4136 a5 = W5*b[5]+W3*b[3];
4137 a3 = W3*b[5]-W5*b[3];
4138 a2 = W2*b[2]+W6*b[6];
4139 a6 = W6*b[2]-W2*b[6];
4140 a0 = W0*b[0]+W0*b[4];
4141 a4 = W0*b[0]-W0*b[4];
4142 /*step 2*/
4143 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4144 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4145 /*step 3*/
4146 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4147 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4148 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4149 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4150 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4151 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4152 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4153 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4154 }
4155 static void wmv2_idct_col(short * b)
4156 {
4157 int s1,s2;
4158 int a0,a1,a2,a3,a4,a5,a6,a7;
4159 /*step 1, with extended precision*/
4160 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4161 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4162 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4163 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4164 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4165 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4166 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4167 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4168 /*step 2*/
4169 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4170 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4171 /*step 3*/
4172 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4173 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4174 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4175 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4177 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4178 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4179 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4180 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4181 }
4182 void ff_wmv2_idct_c(short * block){
4183 int i;
4185 for(i=0;i<64;i+=8){
4186 wmv2_idct_row(block+i);
4187 }
4188 for(i=0;i<8;i++){
4189 wmv2_idct_col(block+i);
4190 }
4191 }
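/* ff_wmv2_idct_c is a separable row/column integer IDCT: as the W*
 * comments above note, W1..W7 are round(2048*sqrt(2)*cos(k*pi/16))
 * butterfly constants.  Rows normalize with (x + (1<<7)) >> 8, while
 * columns first pre-round with (x+4)>>3 to keep three extra bits of
 * precision and then normalize with >>14, limiting rounding error
 * without needing 64-bit intermediates.
 */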
4192 /* XXX: these functions should be removed as soon as all IDCTs are
4193 converted */
4194 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4195 {
4196 ff_wmv2_idct_c(block);
4197 put_pixels_clamped_c(block, dest, line_size);
4198 }
4199 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4200 {
4201 ff_wmv2_idct_c(block);
4202 add_pixels_clamped_c(block, dest, line_size);
4203 }
4204 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4205 {
4206 j_rev_dct (block);
4207 put_pixels_clamped_c(block, dest, line_size);
4208 }
4209 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4210 {
4211 j_rev_dct (block);
4212 add_pixels_clamped_c(block, dest, line_size);
4213 }
4215 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4216 {
4217 j_rev_dct4 (block);
4218 put_pixels_clamped4_c(block, dest, line_size);
4219 }
4220 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4221 {
4222 j_rev_dct4 (block);
4223 add_pixels_clamped4_c(block, dest, line_size);
4224 }
4226 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4227 {
4228 j_rev_dct2 (block);
4229 put_pixels_clamped2_c(block, dest, line_size);
4230 }
4231 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4232 {
4233 j_rev_dct2 (block);
4234 add_pixels_clamped2_c(block, dest, line_size);
4235 }
4237 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4238 {
4239 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4241 dest[0] = cm[(block[0] + 4)>>3];
4242 }
4243 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4244 {
4245 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4247 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4248 }
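/* The ff_jref_idctN wrappers pair a reduced-size inverse DCT with the
 * matching clamped put/add: only the top-left NxN coefficients are
 * transformed.  dsputil_init() below selects them when avctx->lowres
 * requests 1/2 (idct4), 1/4 (idct2) or 1/8 (idct1, DC only) resolution
 * decoding.
 */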
4250 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4252 /* init static data */
4253 void dsputil_static_init(void)
4254 {
4255 int i;
4257 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4258 for(i=0;i<MAX_NEG_CROP;i++) {
4259 ff_cropTbl[i] = 0;
4260 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4261 }
4263 for(i=0;i<512;i++) {
4264 ff_squareTbl[i] = (i - 256) * (i - 256);
4265 }
4267 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4268 }
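/* ff_cropTbl is a clamping LUT with MAX_NEG_CROP guard bytes of 0 below
 * and 255 above the identity section, so cm[x] behaves like
 * av_clip_uint8(x) without branches for moderately out-of-range x;
 * ff_squareTbl similarly caches (i-256)^2.  A hedged sketch of the usual
 * access pattern (the function name is illustrative):
 */
#if 0
static uint8_t clamp_example(int v)
{
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* recenter on 0 */
    return cm[v]; /* valid for v in [-MAX_NEG_CROP, 255+MAX_NEG_CROP) */
}
#endif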
4270 int ff_check_alignment(void){
4271 static int did_fail=0;
4272 DECLARE_ALIGNED_16(int, aligned);
4274 if((long)&aligned & 15){
4275 if(!did_fail){
4276 #if HAVE_MMX || HAVE_ALTIVEC
4277 av_log(NULL, AV_LOG_ERROR,
4278 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4279 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4280 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4281 "Do not report crashes to FFmpeg developers.\n");
4282 #endif
4283 did_fail=1;
4284 }
4285 return -1;
4286 }
4287 return 0;
4288 }
4290 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4291 {
4292 int i;
4294 ff_check_alignment();
4296 #if CONFIG_ENCODERS
4297 if(avctx->dct_algo==FF_DCT_FASTINT) {
4298 c->fdct = fdct_ifast;
4299 c->fdct248 = fdct_ifast248;
4300 }
4301 else if(avctx->dct_algo==FF_DCT_FAAN) {
4302 c->fdct = ff_faandct;
4303 c->fdct248 = ff_faandct248;
4304 }
4305 else {
4306 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4307 c->fdct248 = ff_fdct248_islow;
4308 }
4309 #endif //CONFIG_ENCODERS
4311 if(avctx->lowres==1){
4312 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4313 c->idct_put= ff_jref_idct4_put;
4314 c->idct_add= ff_jref_idct4_add;
4315 }else{
4316 c->idct_put= ff_h264_lowres_idct_put_c;
4317 c->idct_add= ff_h264_lowres_idct_add_c;
4318 }
4319 c->idct = j_rev_dct4;
4320 c->idct_permutation_type= FF_NO_IDCT_PERM;
4321 }else if(avctx->lowres==2){
4322 c->idct_put= ff_jref_idct2_put;
4323 c->idct_add= ff_jref_idct2_add;
4324 c->idct = j_rev_dct2;
4325 c->idct_permutation_type= FF_NO_IDCT_PERM;
4326 }else if(avctx->lowres==3){
4327 c->idct_put= ff_jref_idct1_put;
4328 c->idct_add= ff_jref_idct1_add;
4329 c->idct = j_rev_dct1;
4330 c->idct_permutation_type= FF_NO_IDCT_PERM;
4331 }else{
4332 if(avctx->idct_algo==FF_IDCT_INT){
4333 c->idct_put= ff_jref_idct_put;
4334 c->idct_add= ff_jref_idct_add;
4335 c->idct = j_rev_dct;
4336 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4337 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER ) &&
4338 avctx->idct_algo==FF_IDCT_VP3){
4339 c->idct_put= ff_vp3_idct_put_c;
4340 c->idct_add= ff_vp3_idct_add_c;
4341 c->idct = ff_vp3_idct_c;
4342 c->idct_permutation_type= FF_NO_IDCT_PERM;
4343 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4344 c->idct_put= ff_wmv2_idct_put_c;
4345 c->idct_add= ff_wmv2_idct_add_c;
4346 c->idct = ff_wmv2_idct_c;
4347 c->idct_permutation_type= FF_NO_IDCT_PERM;
4348 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4349 c->idct_put= ff_faanidct_put;
4350 c->idct_add= ff_faanidct_add;
4351 c->idct = ff_faanidct;
4352 c->idct_permutation_type= FF_NO_IDCT_PERM;
4353 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4354 c->idct_put= ff_ea_idct_put_c;
4355 c->idct_permutation_type= FF_NO_IDCT_PERM;
4356 }else{ //accurate/default
4357 c->idct_put= ff_simple_idct_put;
4358 c->idct_add= ff_simple_idct_add;
4359 c->idct = ff_simple_idct;
4360 c->idct_permutation_type= FF_NO_IDCT_PERM;
4361 }
4362 }
4364 if (CONFIG_H264_DECODER) {
4365 c->h264_idct_add= ff_h264_idct_add_c;
4366 c->h264_idct8_add= ff_h264_idct8_add_c;
4367 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4368 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4369 c->h264_idct_add16 = ff_h264_idct_add16_c;
4370 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4371 c->h264_idct_add8 = ff_h264_idct_add8_c;
4372 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4373 }
4375 c->get_pixels = get_pixels_c;
4376 c->diff_pixels = diff_pixels_c;
4377 c->put_pixels_clamped = put_pixels_clamped_c;
4378 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4379 c->add_pixels_clamped = add_pixels_clamped_c;
4380 c->add_pixels8 = add_pixels8_c;
4381 c->add_pixels4 = add_pixels4_c;
4382 c->sum_abs_dctelem = sum_abs_dctelem_c;
4383 c->gmc1 = gmc1_c;
4384 c->gmc = ff_gmc_c;
4385 c->clear_block = clear_block_c;
4386 c->clear_blocks = clear_blocks_c;
4387 c->pix_sum = pix_sum_c;
4388 c->pix_norm1 = pix_norm1_c;
4390 /* TODO [0] 16 [1] 8 */
4391 c->pix_abs[0][0] = pix_abs16_c;
4392 c->pix_abs[0][1] = pix_abs16_x2_c;
4393 c->pix_abs[0][2] = pix_abs16_y2_c;
4394 c->pix_abs[0][3] = pix_abs16_xy2_c;
4395 c->pix_abs[1][0] = pix_abs8_c;
4396 c->pix_abs[1][1] = pix_abs8_x2_c;
4397 c->pix_abs[1][2] = pix_abs8_y2_c;
4398 c->pix_abs[1][3] = pix_abs8_xy2_c;
4400 #define dspfunc(PFX, IDX, NUM) \
4401 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4402 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4403 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4404 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4406 dspfunc(put, 0, 16);
4407 dspfunc(put_no_rnd, 0, 16);
4408 dspfunc(put, 1, 8);
4409 dspfunc(put_no_rnd, 1, 8);
4410 dspfunc(put, 2, 4);
4411 dspfunc(put, 3, 2);
4413 dspfunc(avg, 0, 16);
4414 dspfunc(avg_no_rnd, 0, 16);
4415 dspfunc(avg, 1, 8);
4416 dspfunc(avg_no_rnd, 1, 8);
4417 dspfunc(avg, 2, 4);
4418 dspfunc(avg, 3, 2);
4419 #undef dspfunc
4421 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4422 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4424 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4425 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4426 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4427 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4428 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4429 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4430 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4431 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4432 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4434 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4435 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4436 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4437 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4438 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4439 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4440 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4441 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4442 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4444 #define dspfunc(PFX, IDX, NUM) \
4445 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4446 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4447 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4448 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4449 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4450 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4451 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4452 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4453 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4454 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4455 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4456 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4457 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4458 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4459 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4460 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4462 dspfunc(put_qpel, 0, 16);
4463 dspfunc(put_no_rnd_qpel, 0, 16);
4465 dspfunc(avg_qpel, 0, 16);
4466 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4468 dspfunc(put_qpel, 1, 8);
4469 dspfunc(put_no_rnd_qpel, 1, 8);
4471 dspfunc(avg_qpel, 1, 8);
4472 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4474 dspfunc(put_h264_qpel, 0, 16);
4475 dspfunc(put_h264_qpel, 1, 8);
4476 dspfunc(put_h264_qpel, 2, 4);
4477 dspfunc(put_h264_qpel, 3, 2);
4478 dspfunc(avg_h264_qpel, 0, 16);
4479 dspfunc(avg_h264_qpel, 1, 8);
4480 dspfunc(avg_h264_qpel, 2, 4);
4482 #undef dspfunc
4483 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4484 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4485 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4486 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4487 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4488 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4489 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4491 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4492 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4493 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4494 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4495 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4496 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4497 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4498 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4499 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4500 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4501 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4502 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4503 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4504 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4505 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4506 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4507 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4508 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4509 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4510 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4512 c->draw_edges = draw_edges_c;
4514 #if CONFIG_CAVS_DECODER
4515 ff_cavsdsp_init(c,avctx);
4516 #endif
4517 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
4518 ff_vc1dsp_init(c,avctx);
4519 #endif
4520 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
4521 ff_intrax8dsp_init(c,avctx);
4522 #endif
4523 #if CONFIG_RV30_DECODER
4524 ff_rv30dsp_init(c,avctx);
4525 #endif
4526 #if CONFIG_RV40_DECODER
4527 ff_rv40dsp_init(c,avctx);
4528 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4529 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4530 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4531 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4532 #endif
4534 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4535 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4536 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4537 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4538 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4539 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4540 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4541 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4543 #define SET_CMP_FUNC(name) \
4544 c->name[0]= name ## 16_c;\
4545 c->name[1]= name ## 8x8_c;
4547 SET_CMP_FUNC(hadamard8_diff)
4548 c->hadamard8_diff[4]= hadamard8_intra16_c;
4549 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4550 SET_CMP_FUNC(dct_sad)
4551 SET_CMP_FUNC(dct_max)
4552 #if CONFIG_GPL
4553 SET_CMP_FUNC(dct264_sad)
4554 #endif
4555 c->sad[0]= pix_abs16_c;
4556 c->sad[1]= pix_abs8_c;
4557 c->sse[0]= sse16_c;
4558 c->sse[1]= sse8_c;
4559 c->sse[2]= sse4_c;
4560 SET_CMP_FUNC(quant_psnr)
4561 SET_CMP_FUNC(rd)
4562 SET_CMP_FUNC(bit)
4563 c->vsad[0]= vsad16_c;
4564 c->vsad[4]= vsad_intra16_c;
4565 c->vsad[5]= vsad_intra8_c;
4566 c->vsse[0]= vsse16_c;
4567 c->vsse[4]= vsse_intra16_c;
4568 c->vsse[5]= vsse_intra8_c;
4569 c->nsse[0]= nsse16_c;
4570 c->nsse[1]= nsse8_c;
4571 #if CONFIG_SNOW_ENCODER
4572 c->w53[0]= w53_16_c;
4573 c->w53[1]= w53_8_c;
4574 c->w97[0]= w97_16_c;
4575 c->w97[1]= w97_8_c;
4576 #endif
4578 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4580 c->add_bytes= add_bytes_c;
4581 c->add_bytes_l2= add_bytes_l2_c;
4582 c->diff_bytes= diff_bytes_c;
4583 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4584 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4585 c->bswap_buf= bswap_buf;
4586 #if CONFIG_PNG_DECODER
4587 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4588 #endif
4590 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4591 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4592 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4593 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4594 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4595 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4596 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4597 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4598 c->h264_loop_filter_strength= NULL;
4600 if (CONFIG_ANY_H263) {
4601 c->h263_h_loop_filter= h263_h_loop_filter_c;
4602 c->h263_v_loop_filter= h263_v_loop_filter_c;
4603 }
4605 if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
4606 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4607 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4608 }
4609 if (CONFIG_VP6_DECODER) {
4610 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4611 }
4613 c->h261_loop_filter= h261_loop_filter_c;
4615 c->try_8x8basis= try_8x8basis_c;
4616 c->add_8x8basis= add_8x8basis_c;
4618 #if CONFIG_SNOW_DECODER
4619 c->vertical_compose97i = ff_snow_vertical_compose97i;
4620 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4621 c->inner_add_yblock = ff_snow_inner_add_yblock;
4622 #endif
4624 #if CONFIG_VORBIS_DECODER
4625 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4626 #endif
4627 #if CONFIG_AC3_DECODER
4628 c->ac3_downmix = ff_ac3_downmix_c;
4629 #endif
4630 #if CONFIG_FLAC_ENCODER
4631 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4632 #endif
4633 c->vector_fmul = vector_fmul_c;
4634 c->vector_fmul_reverse = vector_fmul_reverse_c;
4635 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4636 c->vector_fmul_window = ff_vector_fmul_window_c;
4637 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4638 c->float_to_int16 = ff_float_to_int16_c;
4639 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4640 c->add_int16 = add_int16_c;
4641 c->sub_int16 = sub_int16_c;
4642 c->scalarproduct_int16 = scalarproduct_int16_c;
4644 c->shrink[0]= ff_img_copy_plane;
4645 c->shrink[1]= ff_shrink22;
4646 c->shrink[2]= ff_shrink44;
4647 c->shrink[3]= ff_shrink88;
4649 c->prefetch= just_return;
4651 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4652 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4654 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4655 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4656 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4657 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4658 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4659 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4660 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4661 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4662 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4664 for(i=0; i<64; i++){
4665 if(!c->put_2tap_qpel_pixels_tab[0][i])
4666 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4667 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4668 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4669 }
4671 switch(c->idct_permutation_type){
4672 case FF_NO_IDCT_PERM:
4673 for(i=0; i<64; i++)
4674 c->idct_permutation[i]= i;
4675 break;
4676 case FF_LIBMPEG2_IDCT_PERM:
4677 for(i=0; i<64; i++)
4678 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4679 break;
4680 case FF_SIMPLE_IDCT_PERM:
4681 for(i=0; i<64; i++)
4682 c->idct_permutation[i]= simple_mmx_permutation[i];
4683 break;
4684 case FF_TRANSPOSE_IDCT_PERM:
4685 for(i=0; i<64; i++)
4686 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4687 break;
4688 case FF_PARTTRANS_IDCT_PERM:
4689 for(i=0; i<64; i++)
4690 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4691 break;
4692 case FF_SSE2_IDCT_PERM:
4693 for(i=0; i<64; i++)
4694 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4695 break;
4696 default:
4697 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");