apps/codecs/libwmapro/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file libavcodec/dsputil.c
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 //#include "simple_idct.h"
  33 //#include "faandct.h"
  34 //#include "faanidct.h"
  35 #include "mathops.h"
  36 //#include "mpegvideo.h"
  37 //#include "config.h"
  38 //#include "lpc.h"
  39 //#include "ac3dec.h"
  40 //#include "vorbis.h"
  41 //#include "png.h"
  42
  43 #if 0
  44 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  45 uint32_t ff_squareTbl[512] = {0, };
  46
  47 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  48 #define pb_7f (~0UL/255 * 0x7f)
  49 #define pb_80 (~0UL/255 * 0x80)
  50
  51 const uint8_t ff_zigzag_direct[64] = {
  52     0,   1,  8, 16,  9,  2,  3, 10,
  53     17, 24, 32, 25, 18, 11,  4,  5,
  54     12, 19, 26, 33, 40, 48, 41, 34,
  55     27, 20, 13,  6,  7, 14, 21, 28,
  56     35, 42, 49, 56, 57, 50, 43, 36,
  57     29, 22, 15, 23, 30, 37, 44, 51,
  58     58, 59, 52, 45, 38, 31, 39, 46,
  59     53, 60, 61, 54, 47, 55, 62, 63
  60 };
  61
  62 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  63    specification, we interleave the fields */
  64 const uint8_t ff_zigzag248_direct[64] = {
  65      0,  8,  1,  9, 16, 24,  2, 10,
  66     17, 25, 32, 40, 48, 56, 33, 41,
  67     18, 26,  3, 11,  4, 12, 19, 27,
  68     34, 42, 49, 57, 50, 58, 35, 43,
  69     20, 28,  5, 13,  6, 14, 21, 29,
  70     36, 44, 51, 59, 52, 60, 37, 45,
  71     22, 30,  7, 15, 23, 31, 38, 46,
  72     53, 61, 54, 62, 39, 47, 55, 63,
  73 };
  74
  75 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  76 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  77
  78 const uint8_t ff_alternate_horizontal_scan[64] = {
  79     0,  1,   2,  3,  8,  9, 16, 17,
  80     10, 11,  4,  5,  6,  7, 15, 14,
  81     13, 12, 19, 18, 24, 25, 32, 33,
  82     26, 27, 20, 21, 22, 23, 28, 29,
  83     30, 31, 34, 35, 40, 41, 48, 49,
  84     42, 43, 36, 37, 38, 39, 44, 45,
  85     46, 47, 50, 51, 56, 57, 58, 59,
  86     52, 53, 54, 55, 60, 61, 62, 63,
  87 };
  88
  89 const uint8_t ff_alternate_vertical_scan[64] = {
  90     0,  8,  16, 24,  1,  9,  2, 10,
  91     17, 25, 32, 40, 48, 56, 57, 49,
  92     41, 33, 26, 18,  3, 11,  4, 12,
  93     19, 27, 34, 42, 50, 58, 35, 43,
  94     51, 59, 20, 28,  5, 13,  6, 14,
  95     21, 29, 36, 44, 52, 60, 37, 45,
  96     53, 61, 22, 30,  7, 15, 23, 31,
  97     38, 46, 54, 62, 39, 47, 55, 63,
  98 };
  99
 100 /* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 101  * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
 102 const uint32_t ff_inverse[257]={
 103          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 104  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 105  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 106  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 107  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 108  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 109   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 110   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 111   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 112   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 113   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 114   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 115   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 116   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 117   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 118   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 119   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 120   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 121   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 122   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 123   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 124   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 125   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 126   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 127   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 128   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 129   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 130   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 131   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 132   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 133   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 134   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 135   16777216
 136 };
 137
 138 /* Input permutation for the simple_idct_mmx */
 139 static const uint8_t simple_mmx_permutation[64]={
 140         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 141         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 142         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 143         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 144         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 145         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 146         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 147         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 148 };
 149
 150 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 151
 152 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 153     int i;
 154     int end;
 155
 156     st->scantable= src_scantable;
 157
 158     for(i=0; i<64; i++){
 159         int j;
 160         j = src_scantable[i];
 161         st->permutated[i] = permutation[j];
 162 #if ARCH_PPC
 163         st->inverse[j] = i;
 164 #endif
 165     }
 166
 167     end=-1;
 168     for(i=0; i<64; i++){
 169         int j;
 170         j = st->permutated[i];
 171         if(j>end) end=j;
 172         st->raster_end[i]= end;
 173     }
 174 }
 175
 176 static int pix_sum_c(uint8_t * pix, int line_size)
 177 {
 178     int s, i, j;
 179
 180     s = 0;
 181     for (i = 0; i < 16; i++) {
 182         for (j = 0; j < 16; j += 8) {
 183             s += pix[0];
 184             s += pix[1];
 185             s += pix[2];
 186             s += pix[3];
 187             s += pix[4];
 188             s += pix[5];
 189             s += pix[6];
 190             s += pix[7];
 191             pix += 8;
 192         }
 193         pix += line_size - 16;
 194     }
 195     return s;
 196 }
 197
 198 static int pix_norm1_c(uint8_t * pix, int line_size)
 199 {
 200     int s, i, j;
 201     uint32_t *sq = ff_squareTbl + 256;
 202
 203     s = 0;
 204     for (i = 0; i < 16; i++) {
 205         for (j = 0; j < 16; j += 8) {
 206 #if 0
 207             s += sq[pix[0]];
 208             s += sq[pix[1]];
 209             s += sq[pix[2]];
 210             s += sq[pix[3]];
 211             s += sq[pix[4]];
 212             s += sq[pix[5]];
 213             s += sq[pix[6]];
 214             s += sq[pix[7]];
 215 #else
 216 #if LONG_MAX > 2147483647
 217             register uint64_t x=*(uint64_t*)pix;
 218             s += sq[x&0xff];
 219             s += sq[(x>>8)&0xff];
 220             s += sq[(x>>16)&0xff];
 221             s += sq[(x>>24)&0xff];
 222             s += sq[(x>>32)&0xff];
 223             s += sq[(x>>40)&0xff];
 224             s += sq[(x>>48)&0xff];
 225             s += sq[(x>>56)&0xff];
 226 #else
 227             register uint32_t x=*(uint32_t*)pix;
 228             s += sq[x&0xff];
 229             s += sq[(x>>8)&0xff];
 230             s += sq[(x>>16)&0xff];
 231             s += sq[(x>>24)&0xff];
 232             x=*(uint32_t*)(pix+4);
 233             s += sq[x&0xff];
 234             s += sq[(x>>8)&0xff];
 235             s += sq[(x>>16)&0xff];
 236             s += sq[(x>>24)&0xff];
 237 #endif
 238 #endif
 239             pix += 8;
 240         }
 241         pix += line_size - 16;
 242     }
 243     return s;
 244 }
 245
 246 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 247     int i;
 248
 249     for(i=0; i+8<=w; i+=8){
 250         dst[i+0]= bswap_32(src[i+0]);
 251         dst[i+1]= bswap_32(src[i+1]);
 252         dst[i+2]= bswap_32(src[i+2]);
 253         dst[i+3]= bswap_32(src[i+3]);
 254         dst[i+4]= bswap_32(src[i+4]);
 255         dst[i+5]= bswap_32(src[i+5]);
 256         dst[i+6]= bswap_32(src[i+6]);
 257         dst[i+7]= bswap_32(src[i+7]);
 258     }
 259     for(;i<w; i++){
 260         dst[i+0]= bswap_32(src[i+0]);
 261     }
 262 }
 263
 264 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 265 {
 266     int s, i;
 267     uint32_t *sq = ff_squareTbl + 256;
 268
 269     s = 0;
 270     for (i = 0; i < h; i++) {
 271         s += sq[pix1[0] - pix2[0]];
 272         s += sq[pix1[1] - pix2[1]];
 273         s += sq[pix1[2] - pix2[2]];
 274         s += sq[pix1[3] - pix2[3]];
 275         pix1 += line_size;
 276         pix2 += line_size;
 277     }
 278     return s;
 279 }
 280
 281 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 282 {
 283     int s, i;
 284     uint32_t *sq = ff_squareTbl + 256;
 285
 286     s = 0;
 287     for (i = 0; i < h; i++) {
 288         s += sq[pix1[0] - pix2[0]];
 289         s += sq[pix1[1] - pix2[1]];
 290         s += sq[pix1[2] - pix2[2]];
 291         s += sq[pix1[3] - pix2[3]];
 292         s += sq[pix1[4] - pix2[4]];
 293         s += sq[pix1[5] - pix2[5]];
 294         s += sq[pix1[6] - pix2[6]];
 295         s += sq[pix1[7] - pix2[7]];
 296         pix1 += line_size;
 297         pix2 += line_size;
 298     }
 299     return s;
 300 }
 301
 302 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 303 {
 304     int s, i;
 305     uint32_t *sq = ff_squareTbl + 256;
 306
 307     s = 0;
 308     for (i = 0; i < h; i++) {
 309         s += sq[pix1[ 0] - pix2[ 0]];
 310         s += sq[pix1[ 1] - pix2[ 1]];
 311         s += sq[pix1[ 2] - pix2[ 2]];
 312         s += sq[pix1[ 3] - pix2[ 3]];
 313         s += sq[pix1[ 4] - pix2[ 4]];
 314         s += sq[pix1[ 5] - pix2[ 5]];
 315         s += sq[pix1[ 6] - pix2[ 6]];
 316         s += sq[pix1[ 7] - pix2[ 7]];
 317         s += sq[pix1[ 8] - pix2[ 8]];
 318         s += sq[pix1[ 9] - pix2[ 9]];
 319         s += sq[pix1[10] - pix2[10]];
 320         s += sq[pix1[11] - pix2[11]];
 321         s += sq[pix1[12] - pix2[12]];
 322         s += sq[pix1[13] - pix2[13]];
 323         s += sq[pix1[14] - pix2[14]];
 324         s += sq[pix1[15] - pix2[15]];
 325
 326         pix1 += line_size;
 327         pix2 += line_size;
 328     }
 329     return s;
 330 }
 331
 332 /* draw the edges of width 'w' of an image of size width, height */
 333 //FIXME check that this is ok for mpeg4 interlaced
 334 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 335 {
 336     uint8_t *ptr, *last_line;
 337     int i;
 338
 339     last_line = buf + (height - 1) * wrap;
 340     for(i=0;i<w;i++) {
 341         /* top and bottom */
 342         memcpy(buf - (i + 1) * wrap, buf, width);
 343         memcpy(last_line + (i + 1) * wrap, last_line, width);
 344     }
 345     /* left and right */
 346     ptr = buf;
 347     for(i=0;i<height;i++) {
 348         memset(ptr - w, ptr[0], w);
 349         memset(ptr + width, ptr[width-1], w);
 350         ptr += wrap;
 351     }
 352     /* corners */
 353     for(i=0;i<w;i++) {
 354         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 355         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 356         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 357         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 358     }
 359 }
 360
 361 /**
 362  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 363  * @param buf destination buffer
 364  * @param src source buffer
 365  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 366  * @param block_w width of block
 367  * @param block_h height of block
 368  * @param src_x x coordinate of the top left sample of the block in the source buffer
 369  * @param src_y y coordinate of the top left sample of the block in the source buffer
 370  * @param w width of the source buffer
 371  * @param h height of the source buffer
 372  */
 373 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 374                                     int src_x, int src_y, int w, int h){
 375     int x, y;
 376     int start_y, start_x, end_y, end_x;
 377
 378     if(src_y>= h){
 379         src+= (h-1-src_y)*linesize;
 380         src_y=h-1;
 381     }else if(src_y<=-block_h){
 382         src+= (1-block_h-src_y)*linesize;
 383         src_y=1-block_h;
 384     }
 385     if(src_x>= w){
 386         src+= (w-1-src_x);
 387         src_x=w-1;
 388     }else if(src_x<=-block_w){
 389         src+= (1-block_w-src_x);
 390         src_x=1-block_w;
 391     }
 392
 393     start_y= FFMAX(0, -src_y);
 394     start_x= FFMAX(0, -src_x);
 395     end_y= FFMIN(block_h, h-src_y);
 396     end_x= FFMIN(block_w, w-src_x);
 397
 398     // copy existing part
 399     for(y=start_y; y<end_y; y++){
 400         for(x=start_x; x<end_x; x++){
 401             buf[x + y*linesize]= src[x + y*linesize];
 402         }
 403     }
 404
 405     //top
 406     for(y=0; y<start_y; y++){
 407         for(x=start_x; x<end_x; x++){
 408             buf[x + y*linesize]= buf[x + start_y*linesize];
 409         }
 410     }
 411
 412     //bottom
 413     for(y=end_y; y<block_h; y++){
 414         for(x=start_x; x<end_x; x++){
 415             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 416         }
 417     }
 418
 419     for(y=0; y<block_h; y++){
 420        //left
 421         for(x=0; x<start_x; x++){
 422             buf[x + y*linesize]= buf[start_x + y*linesize];
 423         }
 424
 425        //right
 426         for(x=end_x; x<block_w; x++){
 427             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 428         }
 429     }
 430 }
 431
 432 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 433 {
 434     int i;
 435
 436     /* read the pixels */
 437     for(i=0;i<8;i++) {
 438         block[0] = pixels[0];
 439         block[1] = pixels[1];
 440         block[2] = pixels[2];
 441         block[3] = pixels[3];
 442         block[4] = pixels[4];
 443         block[5] = pixels[5];
 444         block[6] = pixels[6];
 445         block[7] = pixels[7];
 446         pixels += line_size;
 447         block += 8;
 448     }
 449 }
 450
 451 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 452                           const uint8_t *s2, int stride){
 453     int i;
 454
 455     /* read the pixels */
 456     for(i=0;i<8;i++) {
 457         block[0] = s1[0] - s2[0];
 458         block[1] = s1[1] - s2[1];
 459         block[2] = s1[2] - s2[2];
 460         block[3] = s1[3] - s2[3];
 461         block[4] = s1[4] - s2[4];
 462         block[5] = s1[5] - s2[5];
 463         block[6] = s1[6] - s2[6];
 464         block[7] = s1[7] - s2[7];
 465         s1 += stride;
 466         s2 += stride;
 467         block += 8;
 468     }
 469 }
 470
 471
 472 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 473                                  int line_size)
 474 {
 475     int i;
 476     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 477
 478     /* read the pixels */
 479     for(i=0;i<8;i++) {
 480         pixels[0] = cm[block[0]];
 481         pixels[1] = cm[block[1]];
 482         pixels[2] = cm[block[2]];
 483         pixels[3] = cm[block[3]];
 484         pixels[4] = cm[block[4]];
 485         pixels[5] = cm[block[5]];
 486         pixels[6] = cm[block[6]];
 487         pixels[7] = cm[block[7]];
 488
 489         pixels += line_size;
 490         block += 8;
 491     }
 492 }
 493
 494 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 495                                  int line_size)
 496 {
 497     int i;
 498     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 499
 500     /* read the pixels */
 501     for(i=0;i<4;i++) {
 502         pixels[0] = cm[block[0]];
 503         pixels[1] = cm[block[1]];
 504         pixels[2] = cm[block[2]];
 505         pixels[3] = cm[block[3]];
 506
 507         pixels += line_size;
 508         block += 8;
 509     }
 510 }
 511
 512 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 513                                  int line_size)
 514 {
 515     int i;
 516     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 517
 518     /* read the pixels */
 519     for(i=0;i<2;i++) {
 520         pixels[0] = cm[block[0]];
 521         pixels[1] = cm[block[1]];
 522
 523         pixels += line_size;
 524         block += 8;
 525     }
 526 }
 527
 528 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 529                                         uint8_t *restrict pixels,
 530                                         int line_size)
 531 {
 532     int i, j;
 533
 534     for (i = 0; i < 8; i++) {
 535         for (j = 0; j < 8; j++) {
 536             if (*block < -128)
 537                 *pixels = 0;
 538             else if (*block > 127)
 539                 *pixels = 255;
 540             else
 541                 *pixels = (uint8_t)(*block + 128);
 542             block++;
 543             pixels++;
 544         }
 545         pixels += (line_size - 8);
 546     }
 547 }
 548
 549 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 550                                     int line_size)
 551 {
 552     int i;
 553
 554     /* read the pixels */
 555     for(i=0;i<8;i++) {
 556         pixels[0] = block[0];
 557         pixels[1] = block[1];
 558         pixels[2] = block[2];
 559         pixels[3] = block[3];
 560         pixels[4] = block[4];
 561         pixels[5] = block[5];
 562         pixels[6] = block[6];
 563         pixels[7] = block[7];
 564
 565         pixels += line_size;
 566         block += 8;
 567     }
 568 }
 569
 570 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 571                           int line_size)
 572 {
 573     int i;
 574     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 575
 576     /* read the pixels */
 577     for(i=0;i<8;i++) {
 578         pixels[0] = cm[pixels[0] + block[0]];
 579         pixels[1] = cm[pixels[1] + block[1]];
 580         pixels[2] = cm[pixels[2] + block[2]];
 581         pixels[3] = cm[pixels[3] + block[3]];
 582         pixels[4] = cm[pixels[4] + block[4]];
 583         pixels[5] = cm[pixels[5] + block[5]];
 584         pixels[6] = cm[pixels[6] + block[6]];
 585         pixels[7] = cm[pixels[7] + block[7]];
 586         pixels += line_size;
 587         block += 8;
 588     }
 589 }
 590
 591 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 592                           int line_size)
 593 {
 594     int i;
 595     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 596
 597     /* read the pixels */
 598     for(i=0;i<4;i++) {
 599         pixels[0] = cm[pixels[0] + block[0]];
 600         pixels[1] = cm[pixels[1] + block[1]];
 601         pixels[2] = cm[pixels[2] + block[2]];
 602         pixels[3] = cm[pixels[3] + block[3]];
 603         pixels += line_size;
 604         block += 8;
 605     }
 606 }
 607
 608 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 609                           int line_size)
 610 {
 611     int i;
 612     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 613
 614     /* read the pixels */
 615     for(i=0;i<2;i++) {
 616         pixels[0] = cm[pixels[0] + block[0]];
 617         pixels[1] = cm[pixels[1] + block[1]];
 618         pixels += line_size;
 619         block += 8;
 620     }
 621 }
 622
 623 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 624 {
 625     int i;
 626     for(i=0;i<8;i++) {
 627         pixels[0] += block[0];
 628         pixels[1] += block[1];
 629         pixels[2] += block[2];
 630         pixels[3] += block[3];
 631         pixels[4] += block[4];
 632         pixels[5] += block[5];
 633         pixels[6] += block[6];
 634         pixels[7] += block[7];
 635         pixels += line_size;
 636         block += 8;
 637     }
 638 }
 639
 640 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 641 {
 642     int i;
 643     for(i=0;i<4;i++) {
 644         pixels[0] += block[0];
 645         pixels[1] += block[1];
 646         pixels[2] += block[2];
 647         pixels[3] += block[3];
 648         pixels += line_size;
 649         block += 4;
 650     }
 651 }
 652
 653 static int sum_abs_dctelem_c(DCTELEM *block)
 654 {
 655     int sum=0, i;
 656     for(i=0; i<64; i++)
 657         sum+= FFABS(block[i]);
 658     return sum;
 659 }
 660
 661 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 662 {
 663     int i;
 664
 665     for (i = 0; i < h; i++) {
 666         memset(block, value, 16);
 667         block += line_size;
 668     }
 669 }
 670
 671 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 672 {
 673     int i;
 674
 675     for (i = 0; i < h; i++) {
 676         memset(block, value, 8);
 677         block += line_size;
 678     }
 679 }
 680
 681 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 682 {
 683     int i, j;
 684     uint16_t *dst1 = (uint16_t *) dst;
 685     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 686
 687     for (j = 0; j < 8; j++) {
 688         for (i = 0; i < 8; i++) {
 689             dst1[i] = dst2[i] = src[i] * 0x0101;
 690         }
 691         src  += 8;
 692         dst1 += linesize;
 693         dst2 += linesize;
 694     }
 695 }
 696
 697 #if 0
 698
 699 #define PIXOP2(OPNAME, OP) \
 700 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 701 {\
 702     int i;\
 703     for(i=0; i<h; i++){\
 704         OP(*((uint64_t*)block), AV_RN64(pixels));\
 705         pixels+=line_size;\
 706         block +=line_size;\
 707     }\
 708 }\
 709 \
 710 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 711 {\
 712     int i;\
 713     for(i=0; i<h; i++){\
 714         const uint64_t a= AV_RN64(pixels  );\
 715         const uint64_t b= AV_RN64(pixels+1);\
 716         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 717         pixels+=line_size;\
 718         block +=line_size;\
 719     }\
 720 }\
 721 \
 722 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 723 {\
 724     int i;\
 725     for(i=0; i<h; i++){\
 726         const uint64_t a= AV_RN64(pixels  );\
 727         const uint64_t b= AV_RN64(pixels+1);\
 728         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 729         pixels+=line_size;\
 730         block +=line_size;\
 731     }\
 732 }\
 733 \
 734 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 735 {\
 736     int i;\
 737     for(i=0; i<h; i++){\
 738         const uint64_t a= AV_RN64(pixels          );\
 739         const uint64_t b= AV_RN64(pixels+line_size);\
 740         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 741         pixels+=line_size;\
 742         block +=line_size;\
 743     }\
 744 }\
 745 \
 746 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 747 {\
 748     int i;\
 749     for(i=0; i<h; i++){\
 750         const uint64_t a= AV_RN64(pixels          );\
 751         const uint64_t b= AV_RN64(pixels+line_size);\
 752         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 753         pixels+=line_size;\
 754         block +=line_size;\
 755     }\
 756 }\
 757 \
 758 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 759 {\
 760         int i;\
 761         const uint64_t a= AV_RN64(pixels  );\
 762         const uint64_t b= AV_RN64(pixels+1);\
 763         uint64_t l0=  (a&0x0303030303030303ULL)\
 764                     + (b&0x0303030303030303ULL)\
 765                     + 0x0202020202020202ULL;\
 766         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 767                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 768         uint64_t l1,h1;\
 769 \
 770         pixels+=line_size;\
 771         for(i=0; i<h; i+=2){\
 772             uint64_t a= AV_RN64(pixels  );\
 773             uint64_t b= AV_RN64(pixels+1);\
 774             l1=  (a&0x0303030303030303ULL)\
 775                + (b&0x0303030303030303ULL);\
 776             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 777               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 778             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 779             pixels+=line_size;\
 780             block +=line_size;\
 781             a= AV_RN64(pixels  );\
 782             b= AV_RN64(pixels+1);\
 783             l0=  (a&0x0303030303030303ULL)\
 784                + (b&0x0303030303030303ULL)\
 785                + 0x0202020202020202ULL;\
 786             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 787               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 788             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 789             pixels+=line_size;\
 790             block +=line_size;\
 791         }\
 792 }\
 793 \
 794 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 795 {\
 796         int i;\
 797         const uint64_t a= AV_RN64(pixels  );\
 798         const uint64_t b= AV_RN64(pixels+1);\
 799         uint64_t l0=  (a&0x0303030303030303ULL)\
 800                     + (b&0x0303030303030303ULL)\
 801                     + 0x0101010101010101ULL;\
 802         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 803                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 804         uint64_t l1,h1;\
 805 \
 806         pixels+=line_size;\
 807         for(i=0; i<h; i+=2){\
 808             uint64_t a= AV_RN64(pixels  );\
 809             uint64_t b= AV_RN64(pixels+1);\
 810             l1=  (a&0x0303030303030303ULL)\
 811                + (b&0x0303030303030303ULL);\
 812             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 813               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 814             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 815             pixels+=line_size;\
 816             block +=line_size;\
 817             a= AV_RN64(pixels  );\
 818             b= AV_RN64(pixels+1);\
 819             l0=  (a&0x0303030303030303ULL)\
 820                + (b&0x0303030303030303ULL)\
 821                + 0x0101010101010101ULL;\
 822             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 823               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 824             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 825             pixels+=line_size;\
 826             block +=line_size;\
 827         }\
 828 }\
 829 \
 830 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 831 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 832 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 833 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 834 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 835 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 836 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 837
 838 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 839 #else // 64 bit variant
 840
 841 #define PIXOP2(OPNAME, OP) \
 842 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 843     int i;\
 844     for(i=0; i<h; i++){\
 845         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 846         pixels+=line_size;\
 847         block +=line_size;\
 848     }\
 849 }\
 850 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 851     int i;\
 852     for(i=0; i<h; i++){\
 853         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 854         pixels+=line_size;\
 855         block +=line_size;\
 856     }\
 857 }\
 858 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 859     int i;\
 860     for(i=0; i<h; i++){\
 861         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 862         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 863         pixels+=line_size;\
 864         block +=line_size;\
 865     }\
 866 }\
 867 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 868     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 869 }\
 870 \
 871 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 872                                                 int src_stride1, int src_stride2, int h){\
 873     int i;\
 874     for(i=0; i<h; i++){\
 875         uint32_t a,b;\
 876         a= AV_RN32(&src1[i*src_stride1  ]);\
 877         b= AV_RN32(&src2[i*src_stride2  ]);\
 878         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 879         a= AV_RN32(&src1[i*src_stride1+4]);\
 880         b= AV_RN32(&src2[i*src_stride2+4]);\
 881         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 882     }\
 883 }\
 884 \
 885 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 886                                                 int src_stride1, int src_stride2, int h){\
 887     int i;\
 888     for(i=0; i<h; i++){\
 889         uint32_t a,b;\
 890         a= AV_RN32(&src1[i*src_stride1  ]);\
 891         b= AV_RN32(&src2[i*src_stride2  ]);\
 892         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 893         a= AV_RN32(&src1[i*src_stride1+4]);\
 894         b= AV_RN32(&src2[i*src_stride2+4]);\
 895         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 896     }\
 897 }\
 898 \
 899 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 900                                                 int src_stride1, int src_stride2, int h){\
 901     int i;\
 902     for(i=0; i<h; i++){\
 903         uint32_t a,b;\
 904         a= AV_RN32(&src1[i*src_stride1  ]);\
 905         b= AV_RN32(&src2[i*src_stride2  ]);\
 906         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 907     }\
 908 }\
 909 \
 910 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 911                                                 int src_stride1, int src_stride2, int h){\
 912     int i;\
 913     for(i=0; i<h; i++){\
 914         uint32_t a,b;\
 915         a= AV_RN16(&src1[i*src_stride1  ]);\
 916         b= AV_RN16(&src2[i*src_stride2  ]);\
 917         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 918     }\
 919 }\
 920 \
 921 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 922                                                 int src_stride1, int src_stride2, int h){\
 923     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 924     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 925 }\
 926 \
 927 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 928                                                 int src_stride1, int src_stride2, int h){\
 929     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 930     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 931 }\
 932 \
 933 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 934     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 935 }\
 936 \
 937 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 938     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 939 }\
 940 \
 941 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 942     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 943 }\
 944 \
 945 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 946     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 947 }\
 948 \
 949 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 950                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 951     int i;\
 952     for(i=0; i<h; i++){\
 953         uint32_t a, b, c, d, l0, l1, h0, h1;\
 954         a= AV_RN32(&src1[i*src_stride1]);\
 955         b= AV_RN32(&src2[i*src_stride2]);\
 956         c= AV_RN32(&src3[i*src_stride3]);\
 957         d= AV_RN32(&src4[i*src_stride4]);\
 958         l0=  (a&0x03030303UL)\
 959            + (b&0x03030303UL)\
 960            + 0x02020202UL;\
 961         h0= ((a&0xFCFCFCFCUL)>>2)\
 962           + ((b&0xFCFCFCFCUL)>>2);\
 963         l1=  (c&0x03030303UL)\
 964            + (d&0x03030303UL);\
 965         h1= ((c&0xFCFCFCFCUL)>>2)\
 966           + ((d&0xFCFCFCFCUL)>>2);\
 967         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 968         a= AV_RN32(&src1[i*src_stride1+4]);\
 969         b= AV_RN32(&src2[i*src_stride2+4]);\
 970         c= AV_RN32(&src3[i*src_stride3+4]);\
 971         d= AV_RN32(&src4[i*src_stride4+4]);\
 972         l0=  (a&0x03030303UL)\
 973            + (b&0x03030303UL)\
 974            + 0x02020202UL;\
 975         h0= ((a&0xFCFCFCFCUL)>>2)\
 976           + ((b&0xFCFCFCFCUL)>>2);\
 977         l1=  (c&0x03030303UL)\
 978            + (d&0x03030303UL);\
 979         h1= ((c&0xFCFCFCFCUL)>>2)\
 980           + ((d&0xFCFCFCFCUL)>>2);\
 981         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 982     }\
 983 }\
 984 \
 985 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 986     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 987 }\
 988 \
 989 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 990     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 991 }\
 992 \
 993 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 994     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 995 }\
 996 \
 997 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 998     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 999 }\
1000 \
1001 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1002                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1003     int i;\
1004     for(i=0; i<h; i++){\
1005         uint32_t a, b, c, d, l0, l1, h0, h1;\
1006         a= AV_RN32(&src1[i*src_stride1]);\
1007         b= AV_RN32(&src2[i*src_stride2]);\
1008         c= AV_RN32(&src3[i*src_stride3]);\
1009         d= AV_RN32(&src4[i*src_stride4]);\
1010         l0=  (a&0x03030303UL)\
1011            + (b&0x03030303UL)\
1012            + 0x01010101UL;\
1013         h0= ((a&0xFCFCFCFCUL)>>2)\
1014           + ((b&0xFCFCFCFCUL)>>2);\
1015         l1=  (c&0x03030303UL)\
1016            + (d&0x03030303UL);\
1017         h1= ((c&0xFCFCFCFCUL)>>2)\
1018           + ((d&0xFCFCFCFCUL)>>2);\
1019         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1020         a= AV_RN32(&src1[i*src_stride1+4]);\
1021         b= AV_RN32(&src2[i*src_stride2+4]);\
1022         c= AV_RN32(&src3[i*src_stride3+4]);\
1023         d= AV_RN32(&src4[i*src_stride4+4]);\
1024         l0=  (a&0x03030303UL)\
1025            + (b&0x03030303UL)\
1026            + 0x01010101UL;\
1027         h0= ((a&0xFCFCFCFCUL)>>2)\
1028           + ((b&0xFCFCFCFCUL)>>2);\
1029         l1=  (c&0x03030303UL)\
1030            + (d&0x03030303UL);\
1031         h1= ((c&0xFCFCFCFCUL)>>2)\
1032           + ((d&0xFCFCFCFCUL)>>2);\
1033         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1034     }\
1035 }\
1036 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1037                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1038     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1039     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1040 }\
1041 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1042                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1043     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1044     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1045 }\
1046 \
1047 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1048 {\
1049         int i, a0, b0, a1, b1;\
1050         a0= pixels[0];\
1051         b0= pixels[1] + 2;\
1052         a0 += b0;\
1053         b0 += pixels[2];\
1054 \
1055         pixels+=line_size;\
1056         for(i=0; i<h; i+=2){\
1057             a1= pixels[0];\
1058             b1= pixels[1];\
1059             a1 += b1;\
1060             b1 += pixels[2];\
1061 \
1062             block[0]= (a1+a0)>>2; /* FIXME non put */\
1063             block[1]= (b1+b0)>>2;\
1064 \
1065             pixels+=line_size;\
1066             block +=line_size;\
1067 \
1068             a0= pixels[0];\
1069             b0= pixels[1] + 2;\
1070             a0 += b0;\
1071             b0 += pixels[2];\
1072 \
1073             block[0]= (a1+a0)>>2;\
1074             block[1]= (b1+b0)>>2;\
1075             pixels+=line_size;\
1076             block +=line_size;\
1077         }\
1078 }\
1079 \
1080 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1081 {\
1082         int i;\
1083         const uint32_t a= AV_RN32(pixels  );\
1084         const uint32_t b= AV_RN32(pixels+1);\
1085         uint32_t l0=  (a&0x03030303UL)\
1086                     + (b&0x03030303UL)\
1087                     + 0x02020202UL;\
1088         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1089                    + ((b&0xFCFCFCFCUL)>>2);\
1090         uint32_t l1,h1;\
1091 \
1092         pixels+=line_size;\
1093         for(i=0; i<h; i+=2){\
1094             uint32_t a= AV_RN32(pixels  );\
1095             uint32_t b= AV_RN32(pixels+1);\
1096             l1=  (a&0x03030303UL)\
1097                + (b&0x03030303UL);\
1098             h1= ((a&0xFCFCFCFCUL)>>2)\
1099               + ((b&0xFCFCFCFCUL)>>2);\
1100             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1101             pixels+=line_size;\
1102             block +=line_size;\
1103             a= AV_RN32(pixels  );\
1104             b= AV_RN32(pixels+1);\
1105             l0=  (a&0x03030303UL)\
1106                + (b&0x03030303UL)\
1107                + 0x02020202UL;\
1108             h0= ((a&0xFCFCFCFCUL)>>2)\
1109               + ((b&0xFCFCFCFCUL)>>2);\
1110             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1111             pixels+=line_size;\
1112             block +=line_size;\
1113         }\
1114 }\
1115 \
1116 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1117 {\
1118     int j;\
1119     for(j=0; j<2; j++){\
1120         int i;\
1121         const uint32_t a= AV_RN32(pixels  );\
1122         const uint32_t b= AV_RN32(pixels+1);\
1123         uint32_t l0=  (a&0x03030303UL)\
1124                     + (b&0x03030303UL)\
1125                     + 0x02020202UL;\
1126         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1127                    + ((b&0xFCFCFCFCUL)>>2);\
1128         uint32_t l1,h1;\
1129 \
1130         pixels+=line_size;\
1131         for(i=0; i<h; i+=2){\
1132             uint32_t a= AV_RN32(pixels  );\
1133             uint32_t b= AV_RN32(pixels+1);\
1134             l1=  (a&0x03030303UL)\
1135                + (b&0x03030303UL);\
1136             h1= ((a&0xFCFCFCFCUL)>>2)\
1137               + ((b&0xFCFCFCFCUL)>>2);\
1138             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1139             pixels+=line_size;\
1140             block +=line_size;\
1141             a= AV_RN32(pixels  );\
1142             b= AV_RN32(pixels+1);\
1143             l0=  (a&0x03030303UL)\
1144                + (b&0x03030303UL)\
1145                + 0x02020202UL;\
1146             h0= ((a&0xFCFCFCFCUL)>>2)\
1147               + ((b&0xFCFCFCFCUL)>>2);\
1148             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1149             pixels+=line_size;\
1150             block +=line_size;\
1151         }\
1152         pixels+=4-line_size*(h+1);\
1153         block +=4-line_size*h;\
1154     }\
1155 }\
1156 \
1157 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1158 {\
1159     int j;\
1160     for(j=0; j<2; j++){\
1161         int i;\
1162         const uint32_t a= AV_RN32(pixels  );\
1163         const uint32_t b= AV_RN32(pixels+1);\
1164         uint32_t l0=  (a&0x03030303UL)\
1165                     + (b&0x03030303UL)\
1166                     + 0x01010101UL;\
1167         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1168                    + ((b&0xFCFCFCFCUL)>>2);\
1169         uint32_t l1,h1;\
1170 \
1171         pixels+=line_size;\
1172         for(i=0; i<h; i+=2){\
1173             uint32_t a= AV_RN32(pixels  );\
1174             uint32_t b= AV_RN32(pixels+1);\
1175             l1=  (a&0x03030303UL)\
1176                + (b&0x03030303UL);\
1177             h1= ((a&0xFCFCFCFCUL)>>2)\
1178               + ((b&0xFCFCFCFCUL)>>2);\
1179             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1180             pixels+=line_size;\
1181             block +=line_size;\
1182             a= AV_RN32(pixels  );\
1183             b= AV_RN32(pixels+1);\
1184             l0=  (a&0x03030303UL)\
1185                + (b&0x03030303UL)\
1186                + 0x01010101UL;\
1187             h0= ((a&0xFCFCFCFCUL)>>2)\
1188               + ((b&0xFCFCFCFCUL)>>2);\
1189             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1190             pixels+=line_size;\
1191             block +=line_size;\
1192         }\
1193         pixels+=4-line_size*(h+1);\
1194         block +=4-line_size*h;\
1195     }\
1196 }\
1197 \
1198 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1199 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1200 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1201 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1202 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1203 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1204 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1205 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1206
1207 #define op_avg(a, b) a = rnd_avg32(a, b)
1208 #endif
1209 #define op_put(a, b) a = b
1210
1211 PIXOP2(avg, op_avg)
1212 PIXOP2(put, op_put)
1213 #undef op_avg
1214 #undef op_put
1215
1216 #define avg2(a,b) ((a+b+1)>>1)
1217 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1218
1219 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1220     put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1221 }
1222
1223 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1224     put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1225 }
1226
1227 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1228 {
1229     const int A=(16-x16)*(16-y16);
1230     const int B=(   x16)*(16-y16);
1231     const int C=(16-x16)*(   y16);
1232     const int D=(   x16)*(   y16);
1233     int i;
1234
1235     for(i=0; i<h; i++)
1236     {
1237         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1238         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1239         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1240         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1241         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1242         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1243         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1244         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1245         dst+= stride;
1246         src+= stride;
1247     }
1248 }
1249
1250 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1251                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1252 {
1253     int y, vx, vy;
1254     const int s= 1<<shift;
1255
1256     width--;
1257     height--;
1258
1259     for(y=0; y<h; y++){
1260         int x;
1261
1262         vx= ox;
1263         vy= oy;
1264         for(x=0; x<8; x++){ //XXX FIXME optimize
1265             int src_x, src_y, frac_x, frac_y, index;
1266
1267             src_x= vx>>16;
1268             src_y= vy>>16;
1269             frac_x= src_x&(s-1);
1270             frac_y= src_y&(s-1);
1271             src_x>>=shift;
1272             src_y>>=shift;
1273
1274             if((unsigned)src_x < width){
1275                 if((unsigned)src_y < height){
1276                     index= src_x + src_y*stride;
1277                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1278                                            + src[index       +1]*   frac_x )*(s-frac_y)
1279                                         + (  src[index+stride  ]*(s-frac_x)
1280                                            + src[index+stride+1]*   frac_x )*   frac_y
1281                                         + r)>>(shift*2);
1282                 }else{
1283                     index= src_x + av_clip(src_y, 0, height)*stride;
1284                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1285                                           + src[index       +1]*   frac_x )*s
1286                                         + r)>>(shift*2);
1287                 }
1288             }else{
1289                 if((unsigned)src_y < height){
1290                     index= av_clip(src_x, 0, width) + src_y*stride;
1291                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1292                                            + src[index+stride  ]*   frac_y )*s
1293                                         + r)>>(shift*2);
1294                 }else{
1295                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1296                     dst[y*stride + x]=    src[index         ];
1297                 }
1298             }
1299
1300             vx+= dxx;
1301             vy+= dyx;
1302         }
1303         ox += dxy;
1304         oy += dyy;
1305     }
1306 }
1307
1308 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1309     switch(width){
1310     case 2: put_pixels2_c (dst, src, stride, height); break;
1311     case 4: put_pixels4_c (dst, src, stride, height); break;
1312     case 8: put_pixels8_c (dst, src, stride, height); break;
1313     case 16:put_pixels16_c(dst, src, stride, height); break;
1314     }
1315 }
1316
1317 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1318     int i,j;
1319     for (i=0; i < height; i++) {
1320       for (j=0; j < width; j++) {
1321         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1322       }
1323       src += stride;
1324       dst += stride;
1325     }
1326 }
1327
1328 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1329     int i,j;
1330     for (i=0; i < height; i++) {
1331       for (j=0; j < width; j++) {
1332         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1333       }
1334       src += stride;
1335       dst += stride;
1336     }
1337 }
1338
1339 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1340     int i,j;
1341     for (i=0; i < height; i++) {
1342       for (j=0; j < width; j++) {
1343         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1344       }
1345       src += stride;
1346       dst += stride;
1347     }
1348 }
1349
1350 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1351     int i,j;
1352     for (i=0; i < height; i++) {
1353       for (j=0; j < width; j++) {
1354         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1355       }
1356       src += stride;
1357       dst += stride;
1358     }
1359 }
1360
1361 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1362     int i,j;
1363     for (i=0; i < height; i++) {
1364       for (j=0; j < width; j++) {
1365         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1366       }
1367       src += stride;
1368       dst += stride;
1369     }
1370 }
1371
1372 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1373     int i,j;
1374     for (i=0; i < height; i++) {
1375       for (j=0; j < width; j++) {
1376         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1377       }
1378       src += stride;
1379       dst += stride;
1380     }
1381 }
1382
1383 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1384     int i,j;
1385     for (i=0; i < height; i++) {
1386       for (j=0; j < width; j++) {
1387         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1388       }
1389       src += stride;
1390       dst += stride;
1391     }
1392 }
1393
1394 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1395     int i,j;
1396     for (i=0; i < height; i++) {
1397       for (j=0; j < width; j++) {
1398         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1399       }
1400       src += stride;
1401       dst += stride;
1402     }
1403 }
1404
1405 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1406     switch(width){
1407     case 2: avg_pixels2_c (dst, src, stride, height); break;
1408     case 4: avg_pixels4_c (dst, src, stride, height); break;
1409     case 8: avg_pixels8_c (dst, src, stride, height); break;
1410     case 16:avg_pixels16_c(dst, src, stride, height); break;
1411     }
1412 }
1413
1414 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1415     int i,j;
1416     for (i=0; i < height; i++) {
1417       for (j=0; j < width; j++) {
1418         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1419       }
1420       src += stride;
1421       dst += stride;
1422     }
1423 }
1424
1425 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1426     int i,j;
1427     for (i=0; i < height; i++) {
1428       for (j=0; j < width; j++) {
1429         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1430       }
1431       src += stride;
1432       dst += stride;
1433     }
1434 }
1435
1436 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1437     int i,j;
1438     for (i=0; i < height; i++) {
1439       for (j=0; j < width; j++) {
1440         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1441       }
1442       src += stride;
1443       dst += stride;
1444     }
1445 }
1446
1447 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1448     int i,j;
1449     for (i=0; i < height; i++) {
1450       for (j=0; j < width; j++) {
1451         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1452       }
1453       src += stride;
1454       dst += stride;
1455     }
1456 }
1457
1458 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1459     int i,j;
1460     for (i=0; i < height; i++) {
1461       for (j=0; j < width; j++) {
1462         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1463       }
1464       src += stride;
1465       dst += stride;
1466     }
1467 }
1468
1469 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1470     int i,j;
1471     for (i=0; i < height; i++) {
1472       for (j=0; j < width; j++) {
1473         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1474       }
1475       src += stride;
1476       dst += stride;
1477     }
1478 }
1479
1480 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1481     int i,j;
1482     for (i=0; i < height; i++) {
1483       for (j=0; j < width; j++) {
1484         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1485       }
1486       src += stride;
1487       dst += stride;
1488     }
1489 }
1490
1491 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1492     int i,j;
1493     for (i=0; i < height; i++) {
1494       for (j=0; j < width; j++) {
1495         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1496       }
1497       src += stride;
1498       dst += stride;
1499     }
1500 }
1501 #if 0
1502 #define TPEL_WIDTH(width)\
1503 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1504     void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1505 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1506     void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1507 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1508     void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1509 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1510     void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1511 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1512     void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1513 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1514     void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1515 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1516     void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1517 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1518     void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1519 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1520     void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1521 #endif
1522
1523 #define H264_CHROMA_MC(OPNAME, OP)\
1524 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1525     const int A=(8-x)*(8-y);\
1526     const int B=(  x)*(8-y);\
1527     const int C=(8-x)*(  y);\
1528     const int D=(  x)*(  y);\
1529     int i;\
1530     \
1531     assert(x<8 && y<8 && x>=0 && y>=0);\
1532 \
1533     if(D){\
1534         for(i=0; i<h; i++){\
1535             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1536             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1537             dst+= stride;\
1538             src+= stride;\
1539         }\
1540     }else{\
1541         const int E= B+C;\
1542         const int step= C ? stride : 1;\
1543         for(i=0; i<h; i++){\
1544             OP(dst[0], (A*src[0] + E*src[step+0]));\
1545             OP(dst[1], (A*src[1] + E*src[step+1]));\
1546             dst+= stride;\
1547             src+= stride;\
1548         }\
1549     }\
1550 }\
1551 \
1552 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1553     const int A=(8-x)*(8-y);\
1554     const int B=(  x)*(8-y);\
1555     const int C=(8-x)*(  y);\
1556     const int D=(  x)*(  y);\
1557     int i;\
1558     \
1559     assert(x<8 && y<8 && x>=0 && y>=0);\
1560 \
1561     if(D){\
1562         for(i=0; i<h; i++){\
1563             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1564             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1565             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1566             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1567             dst+= stride;\
1568             src+= stride;\
1569         }\
1570     }else{\
1571         const int E= B+C;\
1572         const int step= C ? stride : 1;\
1573         for(i=0; i<h; i++){\
1574             OP(dst[0], (A*src[0] + E*src[step+0]));\
1575             OP(dst[1], (A*src[1] + E*src[step+1]));\
1576             OP(dst[2], (A*src[2] + E*src[step+2]));\
1577             OP(dst[3], (A*src[3] + E*src[step+3]));\
1578             dst+= stride;\
1579             src+= stride;\
1580         }\
1581     }\
1582 }\
1583 \
1584 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1585     const int A=(8-x)*(8-y);\
1586     const int B=(  x)*(8-y);\
1587     const int C=(8-x)*(  y);\
1588     const int D=(  x)*(  y);\
1589     int i;\
1590     \
1591     assert(x<8 && y<8 && x>=0 && y>=0);\
1592 \
1593     if(D){\
1594         for(i=0; i<h; i++){\
1595             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1596             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1597             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1598             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1599             OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1600             OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1601             OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1602             OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1603             dst+= stride;\
1604             src+= stride;\
1605         }\
1606     }else{\
1607         const int E= B+C;\
1608         const int step= C ? stride : 1;\
1609         for(i=0; i<h; i++){\
1610             OP(dst[0], (A*src[0] + E*src[step+0]));\
1611             OP(dst[1], (A*src[1] + E*src[step+1]));\
1612             OP(dst[2], (A*src[2] + E*src[step+2]));\
1613             OP(dst[3], (A*src[3] + E*src[step+3]));\
1614             OP(dst[4], (A*src[4] + E*src[step+4]));\
1615             OP(dst[5], (A*src[5] + E*src[step+5]));\
1616             OP(dst[6], (A*src[6] + E*src[step+6]));\
1617             OP(dst[7], (A*src[7] + E*src[step+7]));\
1618             dst+= stride;\
1619             src+= stride;\
1620         }\
1621     }\
1622 }
1623
1624 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1625 #define op_put(a, b) a = (((b) + 32)>>6)
1626
1627 H264_CHROMA_MC(put_       , op_put)
1628 H264_CHROMA_MC(avg_       , op_avg)
1629 #undef op_avg
1630 #undef op_put
1631
1632 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1633     const int A=(8-x)*(8-y);
1634     const int B=(  x)*(8-y);
1635     const int C=(8-x)*(  y);
1636     const int D=(  x)*(  y);
1637     int i;
1638
1639     assert(x<8 && y<8 && x>=0 && y>=0);
1640
1641     for(i=0; i<h; i++)
1642     {
1643         dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1644         dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1645         dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1646         dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1647         dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1648         dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1649         dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1650         dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1651         dst+= stride;
1652         src+= stride;
1653     }
1654 }
1655
1656 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1657     const int A=(8-x)*(8-y);
1658     const int B=(  x)*(8-y);
1659     const int C=(8-x)*(  y);
1660     const int D=(  x)*(  y);
1661     int i;
1662
1663     assert(x<8 && y<8 && x>=0 && y>=0);
1664
1665     for(i=0; i<h; i++)
1666     {
1667         dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1668         dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1669         dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1670         dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1671         dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1672         dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1673         dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1674         dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1675         dst+= stride;
1676         src+= stride;
1677     }
1678 }
1679
1680 #define QPEL_MC(r, OPNAME, RND, OP) \
1681 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1682     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1683     int i;\
1684     for(i=0; i<h; i++)\
1685     {\
1686         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1687         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1688         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1689         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1690         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1691         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1692         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1693         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1694         dst+=dstStride;\
1695         src+=srcStride;\
1696     }\
1697 }\
1698 \
1699 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1700     const int w=8;\
1701     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1702     int i;\
1703     for(i=0; i<w; i++)\
1704     {\
1705         const int src0= src[0*srcStride];\
1706         const int src1= src[1*srcStride];\
1707         const int src2= src[2*srcStride];\
1708         const int src3= src[3*srcStride];\
1709         const int src4= src[4*srcStride];\
1710         const int src5= src[5*srcStride];\
1711         const int src6= src[6*srcStride];\
1712         const int src7= src[7*srcStride];\
1713         const int src8= src[8*srcStride];\
1714         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1715         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1716         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1717         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1718         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1719         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1720         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1721         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1722         dst++;\
1723         src++;\
1724     }\
1725 }\
1726 \
1727 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1728     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1729     int i;\
1730     \
1731     for(i=0; i<h; i++)\
1732     {\
1733         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1734         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1735         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1736         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1737         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1738         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1739         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1740         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1741         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1742         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1743         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1744         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1745         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1746         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1747         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1748         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1749         dst+=dstStride;\
1750         src+=srcStride;\
1751     }\
1752 }\
1753 \
1754 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1755     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1756     int i;\
1757     const int w=16;\
1758     for(i=0; i<w; i++)\
1759     {\
1760         const int src0= src[0*srcStride];\
1761         const int src1= src[1*srcStride];\
1762         const int src2= src[2*srcStride];\
1763         const int src3= src[3*srcStride];\
1764         const int src4= src[4*srcStride];\
1765         const int src5= src[5*srcStride];\
1766         const int src6= src[6*srcStride];\
1767         const int src7= src[7*srcStride];\
1768         const int src8= src[8*srcStride];\
1769         const int src9= src[9*srcStride];\
1770         const int src10= src[10*srcStride];\
1771         const int src11= src[11*srcStride];\
1772         const int src12= src[12*srcStride];\
1773         const int src13= src[13*srcStride];\
1774         const int src14= src[14*srcStride];\
1775         const int src15= src[15*srcStride];\
1776         const int src16= src[16*srcStride];\
1777         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1778         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1779         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1780         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1781         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1782         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1783         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1784         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1785         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1786         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1787         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1788         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1789         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1790         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1791         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1792         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1793         dst++;\
1794         src++;\
1795     }\
1796 }\
1797 \
1798 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1799     OPNAME ## pixels8_c(dst, src, stride, 8);\
1800 }\
1801 \
1802 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1803     uint8_t half[64];\
1804     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1805     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1806 }\
1807 \
1808 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1809     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1810 }\
1811 \
1812 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1813     uint8_t half[64];\
1814     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1815     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1816 }\
1817 \
1818 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1819     uint8_t full[16*9];\
1820     uint8_t half[64];\
1821     copy_block9(full, src, 16, stride, 9);\
1822     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1823     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1824 }\
1825 \
1826 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1827     uint8_t full[16*9];\
1828     copy_block9(full, src, 16, stride, 9);\
1829     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1830 }\
1831 \
1832 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1833     uint8_t full[16*9];\
1834     uint8_t half[64];\
1835     copy_block9(full, src, 16, stride, 9);\
1836     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1837     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1838 }\
1839 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1840     uint8_t full[16*9];\
1841     uint8_t halfH[72];\
1842     uint8_t halfV[64];\
1843     uint8_t halfHV[64];\
1844     copy_block9(full, src, 16, stride, 9);\
1845     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1846     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1847     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1848     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1849 }\
1850 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1851     uint8_t full[16*9];\
1852     uint8_t halfH[72];\
1853     uint8_t halfHV[64];\
1854     copy_block9(full, src, 16, stride, 9);\
1855     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1857     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1858     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1859 }\
1860 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1861     uint8_t full[16*9];\
1862     uint8_t halfH[72];\
1863     uint8_t halfV[64];\
1864     uint8_t halfHV[64];\
1865     copy_block9(full, src, 16, stride, 9);\
1866     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1867     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1868     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1869     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1870 }\
1871 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1872     uint8_t full[16*9];\
1873     uint8_t halfH[72];\
1874     uint8_t halfHV[64];\
1875     copy_block9(full, src, 16, stride, 9);\
1876     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1877     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1878     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1879     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1880 }\
1881 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1882     uint8_t full[16*9];\
1883     uint8_t halfH[72];\
1884     uint8_t halfV[64];\
1885     uint8_t halfHV[64];\
1886     copy_block9(full, src, 16, stride, 9);\
1887     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1888     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1889     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1890     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1891 }\
1892 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1893     uint8_t full[16*9];\
1894     uint8_t halfH[72];\
1895     uint8_t halfHV[64];\
1896     copy_block9(full, src, 16, stride, 9);\
1897     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1898     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1899     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1900     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1901 }\
1902 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1903     uint8_t full[16*9];\
1904     uint8_t halfH[72];\
1905     uint8_t halfV[64];\
1906     uint8_t halfHV[64];\
1907     copy_block9(full, src, 16, stride, 9);\
1908     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1909     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1910     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1911     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1912 }\
1913 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1914     uint8_t full[16*9];\
1915     uint8_t halfH[72];\
1916     uint8_t halfHV[64];\
1917     copy_block9(full, src, 16, stride, 9);\
1918     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1919     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1920     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1921     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1922 }\
1923 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1924     uint8_t halfH[72];\
1925     uint8_t halfHV[64];\
1926     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1927     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1928     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1929 }\
1930 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1931     uint8_t halfH[72];\
1932     uint8_t halfHV[64];\
1933     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1934     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1935     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1936 }\
1937 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1938     uint8_t full[16*9];\
1939     uint8_t halfH[72];\
1940     uint8_t halfV[64];\
1941     uint8_t halfHV[64];\
1942     copy_block9(full, src, 16, stride, 9);\
1943     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1944     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1945     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1946     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1947 }\
1948 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1949     uint8_t full[16*9];\
1950     uint8_t halfH[72];\
1951     copy_block9(full, src, 16, stride, 9);\
1952     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1953     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1954     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1955 }\
1956 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1957     uint8_t full[16*9];\
1958     uint8_t halfH[72];\
1959     uint8_t halfV[64];\
1960     uint8_t halfHV[64];\
1961     copy_block9(full, src, 16, stride, 9);\
1962     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1963     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1964     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1965     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1966 }\
1967 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1968     uint8_t full[16*9];\
1969     uint8_t halfH[72];\
1970     copy_block9(full, src, 16, stride, 9);\
1971     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1972     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1973     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1974 }\
1975 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1976     uint8_t halfH[72];\
1977     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1978     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1979 }\
1980 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1981     OPNAME ## pixels16_c(dst, src, stride, 16);\
1982 }\
1983 \
1984 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1985     uint8_t half[256];\
1986     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1987     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1988 }\
1989 \
1990 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1991     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1992 }\
1993 \
1994 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1995     uint8_t half[256];\
1996     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1997     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1998 }\
1999 \
2000 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2001     uint8_t full[24*17];\
2002     uint8_t half[256];\
2003     copy_block17(full, src, 24, stride, 17);\
2004     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2005     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2006 }\
2007 \
2008 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2009     uint8_t full[24*17];\
2010     copy_block17(full, src, 24, stride, 17);\
2011     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2012 }\
2013 \
2014 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2015     uint8_t full[24*17];\
2016     uint8_t half[256];\
2017     copy_block17(full, src, 24, stride, 17);\
2018     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2019     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2020 }\
2021 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2022     uint8_t full[24*17];\
2023     uint8_t halfH[272];\
2024     uint8_t halfV[256];\
2025     uint8_t halfHV[256];\
2026     copy_block17(full, src, 24, stride, 17);\
2027     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2028     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2029     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2030     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2031 }\
2032 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2033     uint8_t full[24*17];\
2034     uint8_t halfH[272];\
2035     uint8_t halfHV[256];\
2036     copy_block17(full, src, 24, stride, 17);\
2037     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2039     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2040     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2041 }\
2042 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2043     uint8_t full[24*17];\
2044     uint8_t halfH[272];\
2045     uint8_t halfV[256];\
2046     uint8_t halfHV[256];\
2047     copy_block17(full, src, 24, stride, 17);\
2048     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2049     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2050     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2051     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2052 }\
2053 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2054     uint8_t full[24*17];\
2055     uint8_t halfH[272];\
2056     uint8_t halfHV[256];\
2057     copy_block17(full, src, 24, stride, 17);\
2058     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2059     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2060     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2061     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2062 }\
2063 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2064     uint8_t full[24*17];\
2065     uint8_t halfH[272];\
2066     uint8_t halfV[256];\
2067     uint8_t halfHV[256];\
2068     copy_block17(full, src, 24, stride, 17);\
2069     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2070     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2071     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2072     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2073 }\
2074 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2075     uint8_t full[24*17];\
2076     uint8_t halfH[272];\
2077     uint8_t halfHV[256];\
2078     copy_block17(full, src, 24, stride, 17);\
2079     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2080     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2081     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2082     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2083 }\
2084 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2085     uint8_t full[24*17];\
2086     uint8_t halfH[272];\
2087     uint8_t halfV[256];\
2088     uint8_t halfHV[256];\
2089     copy_block17(full, src, 24, stride, 17);\
2090     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2091     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2092     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2093     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2094 }\
2095 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2096     uint8_t full[24*17];\
2097     uint8_t halfH[272];\
2098     uint8_t halfHV[256];\
2099     copy_block17(full, src, 24, stride, 17);\
2100     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2101     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2102     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2103     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2104 }\
2105 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2106     uint8_t halfH[272];\
2107     uint8_t halfHV[256];\
2108     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2109     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2110     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2111 }\
2112 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2113     uint8_t halfH[272];\
2114     uint8_t halfHV[256];\
2115     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2116     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2117     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2118 }\
2119 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2120     uint8_t full[24*17];\
2121     uint8_t halfH[272];\
2122     uint8_t halfV[256];\
2123     uint8_t halfHV[256];\
2124     copy_block17(full, src, 24, stride, 17);\
2125     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2126     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2127     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2128     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2129 }\
2130 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2131     uint8_t full[24*17];\
2132     uint8_t halfH[272];\
2133     copy_block17(full, src, 24, stride, 17);\
2134     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2135     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2136     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2137 }\
2138 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2139     uint8_t full[24*17];\
2140     uint8_t halfH[272];\
2141     uint8_t halfV[256];\
2142     uint8_t halfHV[256];\
2143     copy_block17(full, src, 24, stride, 17);\
2144     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2145     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2146     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2147     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2148 }\
2149 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2150     uint8_t full[24*17];\
2151     uint8_t halfH[272];\
2152     copy_block17(full, src, 24, stride, 17);\
2153     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2154     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2155     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2156 }\
2157 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2158     uint8_t halfH[272];\
2159     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2160     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2161 }
2162
2163 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2164 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2165 #define op_put(a, b) a = cm[((b) + 16)>>5]
2166 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2167
2168 QPEL_MC(0, put_       , _       , op_put)
2169 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2170 QPEL_MC(0, avg_       , _       , op_avg)
2171 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2172 #undef op_avg
2173 #undef op_avg_no_rnd
2174 #undef op_put
2175 #undef op_put_no_rnd
2176
2177 #if 1
2178 #define H264_LOWPASS(OPNAME, OP, OP2) \
2179 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2180     const int h=2;\
2181     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2182     int i;\
2183     for(i=0; i<h; i++)\
2184     {\
2185         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2186         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2187         dst+=dstStride;\
2188         src+=srcStride;\
2189     }\
2190 }\
2191 \
2192 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2193     const int w=2;\
2194     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2195     int i;\
2196     for(i=0; i<w; i++)\
2197     {\
2198         const int srcB= src[-2*srcStride];\
2199         const int srcA= src[-1*srcStride];\
2200         const int src0= src[0 *srcStride];\
2201         const int src1= src[1 *srcStride];\
2202         const int src2= src[2 *srcStride];\
2203         const int src3= src[3 *srcStride];\
2204         const int src4= src[4 *srcStride];\
2205         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2206         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2207         dst++;\
2208         src++;\
2209     }\
2210 }\
2211 \
2212 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2213     const int h=2;\
2214     const int w=2;\
2215     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2216     int i;\
2217     src -= 2*srcStride;\
2218     for(i=0; i<h+5; i++)\
2219     {\
2220         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2221         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2222         tmp+=tmpStride;\
2223         src+=srcStride;\
2224     }\
2225     tmp -= tmpStride*(h+5-2);\
2226     for(i=0; i<w; i++)\
2227     {\
2228         const int tmpB= tmp[-2*tmpStride];\
2229         const int tmpA= tmp[-1*tmpStride];\
2230         const int tmp0= tmp[0 *tmpStride];\
2231         const int tmp1= tmp[1 *tmpStride];\
2232         const int tmp2= tmp[2 *tmpStride];\
2233         const int tmp3= tmp[3 *tmpStride];\
2234         const int tmp4= tmp[4 *tmpStride];\
2235         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2236         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2237         dst++;\
2238         tmp++;\
2239     }\
2240 }\
2241 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2242     const int h=4;\
2243     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2244     int i;\
2245     for(i=0; i<h; i++)\
2246     {\
2247         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2248         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2249         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2250         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2251         dst+=dstStride;\
2252         src+=srcStride;\
2253     }\
2254 }\
2255 \
2256 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2257     const int w=4;\
2258     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2259     int i;\
2260     for(i=0; i<w; i++)\
2261     {\
2262         const int srcB= src[-2*srcStride];\
2263         const int srcA= src[-1*srcStride];\
2264         const int src0= src[0 *srcStride];\
2265         const int src1= src[1 *srcStride];\
2266         const int src2= src[2 *srcStride];\
2267         const int src3= src[3 *srcStride];\
2268         const int src4= src[4 *srcStride];\
2269         const int src5= src[5 *srcStride];\
2270         const int src6= src[6 *srcStride];\
2271         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2272         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2273         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2274         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2275         dst++;\
2276         src++;\
2277     }\
2278 }\
2279 \
2280 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2281     const int h=4;\
2282     const int w=4;\
2283     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284     int i;\
2285     src -= 2*srcStride;\
2286     for(i=0; i<h+5; i++)\
2287     {\
2288         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2289         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2290         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2291         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2292         tmp+=tmpStride;\
2293         src+=srcStride;\
2294     }\
2295     tmp -= tmpStride*(h+5-2);\
2296     for(i=0; i<w; i++)\
2297     {\
2298         const int tmpB= tmp[-2*tmpStride];\
2299         const int tmpA= tmp[-1*tmpStride];\
2300         const int tmp0= tmp[0 *tmpStride];\
2301         const int tmp1= tmp[1 *tmpStride];\
2302         const int tmp2= tmp[2 *tmpStride];\
2303         const int tmp3= tmp[3 *tmpStride];\
2304         const int tmp4= tmp[4 *tmpStride];\
2305         const int tmp5= tmp[5 *tmpStride];\
2306         const int tmp6= tmp[6 *tmpStride];\
2307         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2308         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2309         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2310         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2311         dst++;\
2312         tmp++;\
2313     }\
2314 }\
2315 \
2316 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2317     const int h=8;\
2318     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2319     int i;\
2320     for(i=0; i<h; i++)\
2321     {\
2322         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2323         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2324         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2325         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2326         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2327         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2328         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2329         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2330         dst+=dstStride;\
2331         src+=srcStride;\
2332     }\
2333 }\
2334 \
2335 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2336     const int w=8;\
2337     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2338     int i;\
2339     for(i=0; i<w; i++)\
2340     {\
2341         const int srcB= src[-2*srcStride];\
2342         const int srcA= src[-1*srcStride];\
2343         const int src0= src[0 *srcStride];\
2344         const int src1= src[1 *srcStride];\
2345         const int src2= src[2 *srcStride];\
2346         const int src3= src[3 *srcStride];\
2347         const int src4= src[4 *srcStride];\
2348         const int src5= src[5 *srcStride];\
2349         const int src6= src[6 *srcStride];\
2350         const int src7= src[7 *srcStride];\
2351         const int src8= src[8 *srcStride];\
2352         const int src9= src[9 *srcStride];\
2353         const int src10=src[10*srcStride];\
2354         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2355         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2356         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2357         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2358         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2359         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2360         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2361         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2362         dst++;\
2363         src++;\
2364     }\
2365 }\
2366 \
2367 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2368     const int h=8;\
2369     const int w=8;\
2370     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2371     int i;\
2372     src -= 2*srcStride;\
2373     for(i=0; i<h+5; i++)\
2374     {\
2375         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2376         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2377         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2378         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2379         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2380         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2381         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2382         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2383         tmp+=tmpStride;\
2384         src+=srcStride;\
2385     }\
2386     tmp -= tmpStride*(h+5-2);\
2387     for(i=0; i<w; i++)\
2388     {\
2389         const int tmpB= tmp[-2*tmpStride];\
2390         const int tmpA= tmp[-1*tmpStride];\
2391         const int tmp0= tmp[0 *tmpStride];\
2392         const int tmp1= tmp[1 *tmpStride];\
2393         const int tmp2= tmp[2 *tmpStride];\
2394         const int tmp3= tmp[3 *tmpStride];\
2395         const int tmp4= tmp[4 *tmpStride];\
2396         const int tmp5= tmp[5 *tmpStride];\
2397         const int tmp6= tmp[6 *tmpStride];\
2398         const int tmp7= tmp[7 *tmpStride];\
2399         const int tmp8= tmp[8 *tmpStride];\
2400         const int tmp9= tmp[9 *tmpStride];\
2401         const int tmp10=tmp[10*tmpStride];\
2402         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2403         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2404         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2405         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2406         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2407         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2408         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2409         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2410         dst++;\
2411         tmp++;\
2412     }\
2413 }\
2414 \
2415 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2416     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2417     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2418     src += 8*srcStride;\
2419     dst += 8*dstStride;\
2420     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2421     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2422 }\
2423 \
2424 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2425     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2426     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2427     src += 8*srcStride;\
2428     dst += 8*dstStride;\
2429     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2430     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2431 }\
2432 \
2433 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2434     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2435     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2436     src += 8*srcStride;\
2437     dst += 8*dstStride;\
2438     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2439     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2440 }\
2441
2442 #define H264_MC(OPNAME, SIZE) \
2443 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2444     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2445 }\
2446 \
2447 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2448     uint8_t half[SIZE*SIZE];\
2449     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2450     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2451 }\
2452 \
2453 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2454     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2455 }\
2456 \
2457 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2458     uint8_t half[SIZE*SIZE];\
2459     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2460     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2461 }\
2462 \
2463 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2464     uint8_t full[SIZE*(SIZE+5)];\
2465     uint8_t * const full_mid= full + SIZE*2;\
2466     uint8_t half[SIZE*SIZE];\
2467     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2468     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2469     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2470 }\
2471 \
2472 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2473     uint8_t full[SIZE*(SIZE+5)];\
2474     uint8_t * const full_mid= full + SIZE*2;\
2475     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2476     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2477 }\
2478 \
2479 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2480     uint8_t full[SIZE*(SIZE+5)];\
2481     uint8_t * const full_mid= full + SIZE*2;\
2482     uint8_t half[SIZE*SIZE];\
2483     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2484     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2485     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2486 }\
2487 \
2488 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2489     uint8_t full[SIZE*(SIZE+5)];\
2490     uint8_t * const full_mid= full + SIZE*2;\
2491     uint8_t halfH[SIZE*SIZE];\
2492     uint8_t halfV[SIZE*SIZE];\
2493     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2494     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2495     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2496     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2497 }\
2498 \
2499 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2500     uint8_t full[SIZE*(SIZE+5)];\
2501     uint8_t * const full_mid= full + SIZE*2;\
2502     uint8_t halfH[SIZE*SIZE];\
2503     uint8_t halfV[SIZE*SIZE];\
2504     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2505     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2506     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2507     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2508 }\
2509 \
2510 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2511     uint8_t full[SIZE*(SIZE+5)];\
2512     uint8_t * const full_mid= full + SIZE*2;\
2513     uint8_t halfH[SIZE*SIZE];\
2514     uint8_t halfV[SIZE*SIZE];\
2515     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2516     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2517     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2518     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2519 }\
2520 \
2521 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2522     uint8_t full[SIZE*(SIZE+5)];\
2523     uint8_t * const full_mid= full + SIZE*2;\
2524     uint8_t halfH[SIZE*SIZE];\
2525     uint8_t halfV[SIZE*SIZE];\
2526     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2527     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2528     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2529     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2530 }\
2531 \
2532 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2533     int16_t tmp[SIZE*(SIZE+5)];\
2534     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2535 }\
2536 \
2537 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2538     int16_t tmp[SIZE*(SIZE+5)];\
2539     uint8_t halfH[SIZE*SIZE];\
2540     uint8_t halfHV[SIZE*SIZE];\
2541     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2542     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2543     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2544 }\
2545 \
2546 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2547     int16_t tmp[SIZE*(SIZE+5)];\
2548     uint8_t halfH[SIZE*SIZE];\
2549     uint8_t halfHV[SIZE*SIZE];\
2550     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2551     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2552     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2553 }\
2554 \
2555 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2556     uint8_t full[SIZE*(SIZE+5)];\
2557     uint8_t * const full_mid= full + SIZE*2;\
2558     int16_t tmp[SIZE*(SIZE+5)];\
2559     uint8_t halfV[SIZE*SIZE];\
2560     uint8_t halfHV[SIZE*SIZE];\
2561     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2562     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2563     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2564     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2565 }\
2566 \
2567 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2568     uint8_t full[SIZE*(SIZE+5)];\
2569     uint8_t * const full_mid= full + SIZE*2;\
2570     int16_t tmp[SIZE*(SIZE+5)];\
2571     uint8_t halfV[SIZE*SIZE];\
2572     uint8_t halfHV[SIZE*SIZE];\
2573     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2574     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2575     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2576     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2577 }\
2578
2579 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2580 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2581 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2582 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2583 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2584
2585 H264_LOWPASS(put_       , op_put, op2_put)
2586 H264_LOWPASS(avg_       , op_avg, op2_avg)
2587 H264_MC(put_, 2)
2588 H264_MC(put_, 4)
2589 H264_MC(put_, 8)
2590 H264_MC(put_, 16)
2591 H264_MC(avg_, 4)
2592 H264_MC(avg_, 8)
2593 H264_MC(avg_, 16)
2594
2595 #undef op_avg
2596 #undef op_put
2597 #undef op2_avg
2598 #undef op2_put
2599 #endif
2600
2601 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2602     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2603     int i;
2604
2605     for(i=0; i<h; i++){
2606         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2607         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2608         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2609         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2610         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2611         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2612         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2613         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2614         dst+=dstStride;
2615         src+=srcStride;
2616     }
2617 }
2618
2619 #if CONFIG_CAVS_DECODER
2620 /* AVS specific */
2621 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2622     put_pixels8_c(dst, src, stride, 8);
2623 }
2624 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2625     avg_pixels8_c(dst, src, stride, 8);
2626 }
2627 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2628     put_pixels16_c(dst, src, stride, 16);
2629 }
2630 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2631     avg_pixels16_c(dst, src, stride, 16);
2632 }
2633 #endif /* CONFIG_CAVS_DECODER */
2634
2635 #if CONFIG_VC1_DECODER
2636 /* VC-1 specific */
2637 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2638     put_pixels8_c(dst, src, stride, 8);
2639 }
2640 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2641     avg_pixels8_c(dst, src, stride, 8);
2642 }
2643 #endif /* CONFIG_VC1_DECODER */
2644
2645 #if CONFIG_RV40_DECODER
2646 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2647     put_pixels16_xy2_c(dst, src, stride, 16);
2648 }
2649 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2650     avg_pixels16_xy2_c(dst, src, stride, 16);
2651 }
2652 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2653     put_pixels8_xy2_c(dst, src, stride, 8);
2654 }
2655 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2656     avg_pixels8_xy2_c(dst, src, stride, 8);
2657 }
2658 #endif /* CONFIG_RV40_DECODER */
2659
2660 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2661     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2662     int i;
2663
2664     for(i=0; i<w; i++){
2665         const int src_1= src[ -srcStride];
2666         const int src0 = src[0          ];
2667         const int src1 = src[  srcStride];
2668         const int src2 = src[2*srcStride];
2669         const int src3 = src[3*srcStride];
2670         const int src4 = src[4*srcStride];
2671         const int src5 = src[5*srcStride];
2672         const int src6 = src[6*srcStride];
2673         const int src7 = src[7*srcStride];
2674         const int src8 = src[8*srcStride];
2675         const int src9 = src[9*srcStride];
2676         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2677         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2678         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2679         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2680         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2681         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2682         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2683         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2684         src++;
2685         dst++;
2686     }
2687 }
2688
2689 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2690     put_pixels8_c(dst, src, stride, 8);
2691 }
2692
2693 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2694     uint8_t half[64];
2695     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2696     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2697 }
2698
2699 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2700     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2701 }
2702
2703 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2704     uint8_t half[64];
2705     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2706     put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2707 }
2708
2709 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2710     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2711 }
2712
2713 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2714     uint8_t halfH[88];
2715     uint8_t halfV[64];
2716     uint8_t halfHV[64];
2717     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2718     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2719     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2720     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2721 }
2722 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2723     uint8_t halfH[88];
2724     uint8_t halfV[64];
2725     uint8_t halfHV[64];
2726     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2727     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2728     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2729     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2730 }
2731 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2732     uint8_t halfH[88];
2733     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2734     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2735 }
2736
2737 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2738     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2739     int x;
2740     const int strength= ff_h263_loop_filter_strength[qscale];
2741
2742     for(x=0; x<8; x++){
2743         int d1, d2, ad1;
2744         int p0= src[x-2*stride];
2745         int p1= src[x-1*stride];
2746         int p2= src[x+0*stride];
2747         int p3= src[x+1*stride];
2748         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2749
2750         if     (d<-2*strength) d1= 0;
2751         else if(d<-  strength) d1=-2*strength - d;
2752         else if(d<   strength) d1= d;
2753         else if(d< 2*strength) d1= 2*strength - d;
2754         else                   d1= 0;
2755
2756         p1 += d1;
2757         p2 -= d1;
2758         if(p1&256) p1= ~(p1>>31);
2759         if(p2&256) p2= ~(p2>>31);
2760
2761         src[x-1*stride] = p1;
2762         src[x+0*stride] = p2;
2763
2764         ad1= FFABS(d1)>>1;
2765
2766         d2= av_clip((p0-p3)/4, -ad1, ad1);
2767
2768         src[x-2*stride] = p0 - d2;
2769         src[x+  stride] = p3 + d2;
2770     }
2771     }
2772 }
2773
2774 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2775     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2776     int y;
2777     const int strength= ff_h263_loop_filter_strength[qscale];
2778
2779     for(y=0; y<8; y++){
2780         int d1, d2, ad1;
2781         int p0= src[y*stride-2];
2782         int p1= src[y*stride-1];
2783         int p2= src[y*stride+0];
2784         int p3= src[y*stride+1];
2785         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2786
2787         if     (d<-2*strength) d1= 0;
2788         else if(d<-  strength) d1=-2*strength - d;
2789         else if(d<   strength) d1= d;
2790         else if(d< 2*strength) d1= 2*strength - d;
2791         else                   d1= 0;
2792
2793         p1 += d1;
2794         p2 -= d1;
2795         if(p1&256) p1= ~(p1>>31);
2796         if(p2&256) p2= ~(p2>>31);
2797
2798         src[y*stride-1] = p1;
2799         src[y*stride+0] = p2;
2800
2801         ad1= FFABS(d1)>>1;
2802
2803         d2= av_clip((p0-p3)/4, -ad1, ad1);
2804
2805         src[y*stride-2] = p0 - d2;
2806         src[y*stride+1] = p3 + d2;
2807     }
2808     }
2809 }
2810
2811 static void h261_loop_filter_c(uint8_t *src, int stride){
2812     int x,y,xy,yz;
2813     int temp[64];
2814
2815     for(x=0; x<8; x++){
2816         temp[x      ] = 4*src[x           ];
2817         temp[x + 7*8] = 4*src[x + 7*stride];
2818     }
2819     for(y=1; y<7; y++){
2820         for(x=0; x<8; x++){
2821             xy = y * stride + x;
2822             yz = y * 8 + x;
2823             temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2824         }
2825     }
2826
2827     for(y=0; y<8; y++){
2828         src[  y*stride] = (temp[  y*8] + 2)>>2;
2829         src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2830         for(x=1; x<7; x++){
2831             xy = y * stride + x;
2832             yz = y * 8 + x;
2833             src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2834         }
2835     }
2836 }
2837
2838 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2839 {
2840     int s, i;
2841
2842     s = 0;
2843     for(i=0;i<h;i++) {
2844         s += abs(pix1[0] - pix2[0]);
2845         s += abs(pix1[1] - pix2[1]);
2846         s += abs(pix1[2] - pix2[2]);
2847         s += abs(pix1[3] - pix2[3]);
2848         s += abs(pix1[4] - pix2[4]);
2849         s += abs(pix1[5] - pix2[5]);
2850         s += abs(pix1[6] - pix2[6]);
2851         s += abs(pix1[7] - pix2[7]);
2852         s += abs(pix1[8] - pix2[8]);
2853         s += abs(pix1[9] - pix2[9]);
2854         s += abs(pix1[10] - pix2[10]);
2855         s += abs(pix1[11] - pix2[11]);
2856         s += abs(pix1[12] - pix2[12]);
2857         s += abs(pix1[13] - pix2[13]);
2858         s += abs(pix1[14] - pix2[14]);
2859         s += abs(pix1[15] - pix2[15]);
2860         pix1 += line_size;
2861         pix2 += line_size;
2862     }
2863     return s;
2864 }
2865
2866 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2867 {
2868     int s, i;
2869
2870     s = 0;
2871     for(i=0;i<h;i++) {
2872         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2873         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2874         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2875         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2876         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2877         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2878         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2879         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2880         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2881         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2882         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2883         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2884         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2885         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2886         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2887         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2888         pix1 += line_size;
2889         pix2 += line_size;
2890     }
2891     return s;
2892 }
2893
2894 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2895 {
2896     int s, i;
2897     uint8_t *pix3 = pix2 + line_size;
2898
2899     s = 0;
2900     for(i=0;i<h;i++) {
2901         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2902         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2903         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2904         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2905         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2906         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2907         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2908         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2909         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2910         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2911         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2912         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2913         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2914         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2915         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2916         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2917         pix1 += line_size;
2918         pix2 += line_size;
2919         pix3 += line_size;
2920     }
2921     return s;
2922 }
2923
2924 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2925 {
2926     int s, i;
2927     uint8_t *pix3 = pix2 + line_size;
2928
2929     s = 0;
2930     for(i=0;i<h;i++) {
2931         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2932         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2933         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2934         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2935         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2936         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2937         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2938         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2939         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2940         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2941         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2942         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2943         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2944         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2945         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2946         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2947         pix1 += line_size;
2948         pix2 += line_size;
2949         pix3 += line_size;
2950     }
2951     return s;
2952 }
2953
2954 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2955 {
2956     int s, i;
2957
2958     s = 0;
2959     for(i=0;i<h;i++) {
2960         s += abs(pix1[0] - pix2[0]);
2961         s += abs(pix1[1] - pix2[1]);
2962         s += abs(pix1[2] - pix2[2]);
2963         s += abs(pix1[3] - pix2[3]);
2964         s += abs(pix1[4] - pix2[4]);
2965         s += abs(pix1[5] - pix2[5]);
2966         s += abs(pix1[6] - pix2[6]);
2967         s += abs(pix1[7] - pix2[7]);
2968         pix1 += line_size;
2969         pix2 += line_size;
2970     }
2971     return s;
2972 }
2973
2974 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2975 {
2976     int s, i;
2977
2978     s = 0;
2979     for(i=0;i<h;i++) {
2980         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2981         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2982         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2983         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2984         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2985         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2986         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2987         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2988         pix1 += line_size;
2989         pix2 += line_size;
2990     }
2991     return s;
2992 }
2993
2994 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2995 {
2996     int s, i;
2997     uint8_t *pix3 = pix2 + line_size;
2998
2999     s = 0;
3000     for(i=0;i<h;i++) {
3001         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3002         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3003         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3004         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3005         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3006         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3007         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3008         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3009         pix1 += line_size;
3010         pix2 += line_size;
3011         pix3 += line_size;
3012     }
3013     return s;
3014 }
3015
3016 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3017 {
3018     int s, i;
3019     uint8_t *pix3 = pix2 + line_size;
3020
3021     s = 0;
3022     for(i=0;i<h;i++) {
3023         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3024         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3025         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3026         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3027         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3028         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3029         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3030         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3031         pix1 += line_size;
3032         pix2 += line_size;
3033         pix3 += line_size;
3034     }
3035     return s;
3036 }
3037
3038 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3039     MpegEncContext *c = v;
3040     int score1=0;
3041     int score2=0;
3042     int x,y;
3043
3044     for(y=0; y<h; y++){
3045         for(x=0; x<16; x++){
3046             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3047         }
3048         if(y+1<h){
3049             for(x=0; x<15; x++){
3050                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3051                              - s1[x+1] + s1[x+1+stride])
3052                         -FFABS(  s2[x  ] - s2[x  +stride]
3053                              - s2[x+1] + s2[x+1+stride]);
3054             }
3055         }
3056         s1+= stride;
3057         s2+= stride;
3058     }
3059
3060     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3061     else  return score1 + FFABS(score2)*8;
3062 }
3063
3064 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3065     MpegEncContext *c = v;
3066     int score1=0;
3067     int score2=0;
3068     int x,y;
3069
3070     for(y=0; y<h; y++){
3071         for(x=0; x<8; x++){
3072             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3073         }
3074         if(y+1<h){
3075             for(x=0; x<7; x++){
3076                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3077                              - s1[x+1] + s1[x+1+stride])
3078                         -FFABS(  s2[x  ] - s2[x  +stride]
3079                              - s2[x+1] + s2[x+1+stride]);
3080             }
3081         }
3082         s1+= stride;
3083         s2+= stride;
3084     }
3085
3086     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3087     else  return score1 + FFABS(score2)*8;
3088 }
3089
3090 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3091     int i;
3092     unsigned int sum=0;
3093
3094     for(i=0; i<8*8; i++){
3095         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3096         int w= weight[i];
3097         b>>= RECON_SHIFT;
3098         assert(-512<b && b<512);
3099
3100         sum += (w*b)*(w*b)>>4;
3101     }
3102     return sum>>2;
3103 }
3104
3105 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3106     int i;
3107
3108     for(i=0; i<8*8; i++){
3109         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3110     }
3111 }
3112
3113 /**
3114  * permutes an 8x8 block.
3115  * @param block the block which will be permuted according to the given permutation vector
3116  * @param permutation the permutation vector
3117  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3118  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3119  *                  (inverse) permutated to scantable order!
3120  */
3121 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3122 {
3123     int i;
3124     DCTELEM temp[64];
3125
3126     if(last<=0) return;
3127     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3128
3129     for(i=0; i<=last; i++){
3130         const int j= scantable[i];
3131         temp[j]= block[j];
3132         block[j]=0;
3133     }
3134
3135     for(i=0; i<=last; i++){
3136         const int j= scantable[i];
3137         const int perm_j= permutation[j];
3138         block[perm_j]= temp[j];
3139     }
3140 }
3141
3142 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3143     return 0;
3144 }
3145
3146 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3147     int i;
3148
3149     memset(cmp, 0, sizeof(void*)*6);
3150
3151     for(i=0; i<6; i++){
3152         switch(type&0xFF){
3153         case FF_CMP_SAD:
3154             cmp[i]= c->sad[i];
3155             break;
3156         case FF_CMP_SATD:
3157             cmp[i]= c->hadamard8_diff[i];
3158             break;
3159         case FF_CMP_SSE:
3160             cmp[i]= c->sse[i];
3161             break;
3162         case FF_CMP_DCT:
3163             cmp[i]= c->dct_sad[i];
3164             break;
3165         case FF_CMP_DCT264:
3166             cmp[i]= c->dct264_sad[i];
3167             break;
3168         case FF_CMP_DCTMAX:
3169             cmp[i]= c->dct_max[i];
3170             break;
3171         case FF_CMP_PSNR:
3172             cmp[i]= c->quant_psnr[i];
3173             break;
3174         case FF_CMP_BIT:
3175             cmp[i]= c->bit[i];
3176             break;
3177         case FF_CMP_RD:
3178             cmp[i]= c->rd[i];
3179             break;
3180         case FF_CMP_VSAD:
3181             cmp[i]= c->vsad[i];
3182             break;
3183         case FF_CMP_VSSE:
3184             cmp[i]= c->vsse[i];
3185             break;
3186         case FF_CMP_ZERO:
3187             cmp[i]= zero_cmp;
3188             break;
3189         case FF_CMP_NSSE:
3190             cmp[i]= c->nsse[i];
3191             break;
3192 #if CONFIG_DWT
3193         case FF_CMP_W53:
3194             cmp[i]= c->w53[i];
3195             break;
3196         case FF_CMP_W97:
3197             cmp[i]= c->w97[i];
3198             break;
3199 #endif
3200         default:
3201             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3202         }
3203     }
3204 }
3205
3206 static void clear_block_c(DCTELEM *block)
3207 {
3208     memset(block, 0, sizeof(DCTELEM)*64);
3209 }
3210
3211 /**
3212  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3213  */
3214 static void clear_blocks_c(DCTELEM *blocks)
3215 {
3216     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3217 }
3218
3219 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3220     long i;
3221     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3222         long a = *(long*)(src+i);
3223         long b = *(long*)(dst+i);
3224         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3225     }
3226     for(; i<w; i++)
3227         dst[i+0] += src[i+0];
3228 }
3229
3230 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3231     long i;
3232     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3233         long a = *(long*)(src1+i);
3234         long b = *(long*)(src2+i);
3235         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3236     }
3237     for(; i<w; i++)
3238         dst[i] = src1[i]+src2[i];
3239 }
3240
3241 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3242     long i;
3243 #if !HAVE_FAST_UNALIGNED
3244     if((long)src2 & (sizeof(long)-1)){
3245         for(i=0; i+7<w; i+=8){
3246             dst[i+0] = src1[i+0]-src2[i+0];
3247             dst[i+1] = src1[i+1]-src2[i+1];
3248             dst[i+2] = src1[i+2]-src2[i+2];
3249             dst[i+3] = src1[i+3]-src2[i+3];
3250             dst[i+4] = src1[i+4]-src2[i+4];
3251             dst[i+5] = src1[i+5]-src2[i+5];
3252             dst[i+6] = src1[i+6]-src2[i+6];
3253             dst[i+7] = src1[i+7]-src2[i+7];
3254         }
3255     }else
3256 #endif
3257     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3258         long a = *(long*)(src1+i);
3259         long b = *(long*)(src2+i);
3260         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3261     }
3262     for(; i<w; i++)
3263         dst[i+0] = src1[i+0]-src2[i+0];
3264 }
3265
3266 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3267     int i;
3268     uint8_t l, lt;
3269
3270     l= *left;
3271     lt= *left_top;
3272
3273     for(i=0; i<w; i++){
3274         l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3275         lt= src1[i];
3276         dst[i]= l;
3277     }
3278
3279     *left= l;
3280     *left_top= lt;
3281 }
3282
3283 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3284     int i;
3285     uint8_t l, lt;
3286
3287     l= *left;
3288     lt= *left_top;
3289
3290     for(i=0; i<w; i++){
3291         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3292         lt= src1[i];
3293         l= src2[i];
3294         dst[i]= l - pred;
3295     }
3296
3297     *left= l;
3298     *left_top= lt;
3299 }
3300
3301 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3302     int i;
3303
3304     for(i=0; i<w-1; i++){
3305         acc+= src[i];
3306         dst[i]= acc;
3307         i++;
3308         acc+= src[i];
3309         dst[i]= acc;
3310     }
3311
3312     for(; i<w; i++){
3313         acc+= src[i];
3314         dst[i]= acc;
3315     }
3316
3317     return acc;
3318 }
3319
3320 #if HAVE_BIGENDIAN
3321 #define B 3
3322 #define G 2
3323 #define R 1
3324 #define A 0
3325 #else
3326 #define B 0
3327 #define G 1
3328 #define R 2
3329 #define A 3
3330 #endif
3331 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
3332     int i;
3333     int r,g,b,a;
3334     r= *red;
3335     g= *green;
3336     b= *blue;
3337     a= *alpha;
3338
3339     for(i=0; i<w; i++){
3340         b+= src[4*i+B];
3341         g+= src[4*i+G];
3342         r+= src[4*i+R];
3343         a+= src[4*i+A];
3344
3345         dst[4*i+B]= b;
3346         dst[4*i+G]= g;
3347         dst[4*i+R]= r;
3348         dst[4*i+A]= a;
3349     }
3350
3351     *red= r;
3352     *green= g;
3353     *blue= b;
3354     *alpha= a;
3355 }
3356 #undef B
3357 #undef G
3358 #undef R
3359 #undef A
3360
3361 #define BUTTERFLY2(o1,o2,i1,i2) \
3362 o1= (i1)+(i2);\
3363 o2= (i1)-(i2);
3364
3365 #define BUTTERFLY1(x,y) \
3366 {\
3367     int a,b;\
3368     a= x;\
3369     b= y;\
3370     x= a+b;\
3371     y= a-b;\
3372 }
3373
3374 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3375
3376 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3377     int i;
3378     int temp[64];
3379     int sum=0;
3380
3381     assert(h==8);
3382
3383     for(i=0; i<8; i++){
3384         //FIXME try pointer walks
3385         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3386         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3387         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3388         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3389
3390         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3391         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3392         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3393         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3394
3395         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3396         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3397         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3398         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3399     }
3400
3401     for(i=0; i<8; i++){
3402         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3403         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3404         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3405         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3406
3407         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3408         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3409         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3410         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3411
3412         sum +=
3413              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3414             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3415             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3416             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3417     }
3418 #if 0
3419 static int maxi=0;
3420 if(sum>maxi){
3421     maxi=sum;
3422     printf("MAX:%d\n", maxi);
3423 }
3424 #endif
3425     return sum;
3426 }
3427
3428 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3429     int i;
3430     int temp[64];
3431     int sum=0;
3432
3433     assert(h==8);
3434
3435     for(i=0; i<8; i++){
3436         //FIXME try pointer walks
3437         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3438         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3439         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3440         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3441
3442         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3443         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3444         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3445         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3446
3447         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3448         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3449         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3450         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3451     }
3452
3453     for(i=0; i<8; i++){
3454         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3455         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3456         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3457         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3458
3459         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3460         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3461         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3462         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3463
3464         sum +=
3465              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3466             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3467             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3468             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3469     }
3470
3471     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3472
3473     return sum;
3474 }
3475
3476 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3477     MpegEncContext * const s= (MpegEncContext *)c;
3478     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3479
3480     assert(h==8);
3481
3482     s->dsp.diff_pixels(temp, src1, src2, stride);
3483     s->dsp.fdct(temp);
3484     return s->dsp.sum_abs_dctelem(temp);
3485 }
3486
3487 #if CONFIG_GPL
3488 #define DCT8_1D {\
3489     const int s07 = SRC(0) + SRC(7);\
3490     const int s16 = SRC(1) + SRC(6);\
3491     const int s25 = SRC(2) + SRC(5);\
3492     const int s34 = SRC(3) + SRC(4);\
3493     const int a0 = s07 + s34;\
3494     const int a1 = s16 + s25;\
3495     const int a2 = s07 - s34;\
3496     const int a3 = s16 - s25;\
3497     const int d07 = SRC(0) - SRC(7);\
3498     const int d16 = SRC(1) - SRC(6);\
3499     const int d25 = SRC(2) - SRC(5);\
3500     const int d34 = SRC(3) - SRC(4);\
3501     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3502     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3503     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3504     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3505     DST(0,  a0 + a1     ) ;\
3506     DST(1,  a4 + (a7>>2)) ;\
3507     DST(2,  a2 + (a3>>1)) ;\
3508     DST(3,  a5 + (a6>>2)) ;\
3509     DST(4,  a0 - a1     ) ;\
3510     DST(5,  a6 - (a5>>2)) ;\
3511     DST(6, (a2>>1) - a3 ) ;\
3512     DST(7, (a4>>2) - a7 ) ;\
3513 }
3514
3515 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3516     MpegEncContext * const s= (MpegEncContext *)c;
3517     DCTELEM dct[8][8];
3518     int i;
3519     int sum=0;
3520
3521     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3522
3523 #define SRC(x) dct[i][x]
3524 #define DST(x,v) dct[i][x]= v
3525     for( i = 0; i < 8; i++ )
3526         DCT8_1D
3527 #undef SRC
3528 #undef DST
3529
3530 #define SRC(x) dct[x][i]
3531 #define DST(x,v) sum += FFABS(v)
3532     for( i = 0; i < 8; i++ )
3533         DCT8_1D
3534 #undef SRC
3535 #undef DST
3536     return sum;
3537 }
3538 #endif
3539
3540 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3541     MpegEncContext * const s= (MpegEncContext *)c;
3542     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3543     int sum=0, i;
3544
3545     assert(h==8);
3546
3547     s->dsp.diff_pixels(temp, src1, src2, stride);
3548     s->dsp.fdct(temp);
3549
3550     for(i=0; i<64; i++)
3551         sum= FFMAX(sum, FFABS(temp[i]));
3552
3553     return sum;
3554 }
3555
3556 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3557     MpegEncContext * const s= (MpegEncContext *)c;
3558     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3559     DCTELEM * const bak = temp+64;
3560     int sum=0, i;
3561
3562     assert(h==8);
3563     s->mb_intra=0;
3564
3565     s->dsp.diff_pixels(temp, src1, src2, stride);
3566
3567     memcpy(bak, temp, 64*sizeof(DCTELEM));
3568
3569     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3570     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3571     ff_simple_idct(temp); //FIXME
3572
3573     for(i=0; i<64; i++)
3574         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3575
3576     return sum;
3577 }
3578
3579 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3580     MpegEncContext * const s= (MpegEncContext *)c;
3581     const uint8_t *scantable= s->intra_scantable.permutated;
3582     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3583     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3584     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3585     int i, last, run, bits, level, distortion, start_i;
3586     const int esc_length= s->ac_esc_length;
3587     uint8_t * length;
3588     uint8_t * last_length;
3589
3590     assert(h==8);
3591
3592     copy_block8(lsrc1, src1, 8, stride, 8);
3593     copy_block8(lsrc2, src2, 8, stride, 8);
3594
3595     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3596
3597     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3598
3599     bits=0;
3600
3601     if (s->mb_intra) {
3602         start_i = 1;
3603         length     = s->intra_ac_vlc_length;
3604         last_length= s->intra_ac_vlc_last_length;
3605         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3606     } else {
3607         start_i = 0;
3608         length     = s->inter_ac_vlc_length;
3609         last_length= s->inter_ac_vlc_last_length;
3610     }
3611
3612     if(last>=start_i){
3613         run=0;
3614         for(i=start_i; i<last; i++){
3615             int j= scantable[i];
3616             level= temp[j];
3617
3618             if(level){
3619                 level+=64;
3620                 if((level&(~127)) == 0){
3621                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3622                 }else
3623                     bits+= esc_length;
3624                 run=0;
3625             }else
3626                 run++;
3627         }
3628         i= scantable[last];
3629
3630         level= temp[i] + 64;
3631
3632         assert(level - 64);
3633
3634         if((level&(~127)) == 0){
3635             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3636         }else
3637             bits+= esc_length;
3638
3639     }
3640
3641     if(last>=0){
3642         if(s->mb_intra)
3643             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3644         else
3645             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3646     }
3647
3648     s->dsp.idct_add(lsrc2, 8, temp);
3649
3650     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3651
3652     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3653 }
3654
3655 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3656     MpegEncContext * const s= (MpegEncContext *)c;
3657     const uint8_t *scantable= s->intra_scantable.permutated;
3658     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3659     int i, last, run, bits, level, start_i;
3660     const int esc_length= s->ac_esc_length;
3661     uint8_t * length;
3662     uint8_t * last_length;
3663
3664     assert(h==8);
3665
3666     s->dsp.diff_pixels(temp, src1, src2, stride);
3667
3668     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3669
3670     bits=0;
3671
3672     if (s->mb_intra) {
3673         start_i = 1;
3674         length     = s->intra_ac_vlc_length;
3675         last_length= s->intra_ac_vlc_last_length;
3676         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3677     } else {
3678         start_i = 0;
3679         length     = s->inter_ac_vlc_length;
3680         last_length= s->inter_ac_vlc_last_length;
3681     }
3682
3683     if(last>=start_i){
3684         run=0;
3685         for(i=start_i; i<last; i++){
3686             int j= scantable[i];
3687             level= temp[j];
3688
3689             if(level){
3690                 level+=64;
3691                 if((level&(~127)) == 0){
3692                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3693                 }else
3694                     bits+= esc_length;
3695                 run=0;
3696             }else
3697                 run++;
3698         }
3699         i= scantable[last];
3700
3701         level= temp[i] + 64;
3702
3703         assert(level - 64);
3704
3705         if((level&(~127)) == 0){
3706             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3707         }else
3708             bits+= esc_length;
3709     }
3710
3711     return bits;
3712 }
3713
3714 #define VSAD_INTRA(size) \
3715 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3716     int score=0;                                                                                            \
3717     int x,y;                                                                                                \
3718                                                                                                             \
3719     for(y=1; y<h; y++){                                                                                     \
3720         for(x=0; x<size; x+=4){                                                                             \
3721             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
3722                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
3723         }                                                                                                   \
3724         s+= stride;                                                                                         \
3725     }                                                                                                       \
3726                                                                                                             \
3727     return score;                                                                                           \
3728 }
3729 VSAD_INTRA(8)
3730 VSAD_INTRA(16)
3731
3732 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3733     int score=0;
3734     int x,y;
3735
3736     for(y=1; y<h; y++){
3737         for(x=0; x<16; x++){
3738             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3739         }
3740         s1+= stride;
3741         s2+= stride;
3742     }
3743
3744     return score;
3745 }
3746
3747 #define SQ(a) ((a)*(a))
3748 #define VSSE_INTRA(size) \
3749 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3750     int score=0;                                                                                            \
3751     int x,y;                                                                                                \
3752                                                                                                             \
3753     for(y=1; y<h; y++){                                                                                     \
3754         for(x=0; x<size; x+=4){                                                                               \
3755             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
3756                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
3757         }                                                                                                   \
3758         s+= stride;                                                                                         \
3759     }                                                                                                       \
3760                                                                                                             \
3761     return score;                                                                                           \
3762 }
3763 VSSE_INTRA(8)
3764 VSSE_INTRA(16)
3765
3766 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3767     int score=0;
3768     int x,y;
3769
3770     for(y=1; y<h; y++){
3771         for(x=0; x<16; x++){
3772             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3773         }
3774         s1+= stride;
3775         s2+= stride;
3776     }
3777
3778     return score;
3779 }
3780
3781 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3782                                int size){
3783     int score=0;
3784     int i;
3785     for(i=0; i<size; i++)
3786         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3787     return score;
3788 }
3789
3790 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3791 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3792 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3793 #if CONFIG_GPL
3794 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3795 #endif
3796 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3797 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3798 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3799 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3800
3801 #endif /* 0 */
3802
3803 static void vector_fmul_c(float *dst, const float *src, int len){
3804     int i;
3805     for(i=0; i<len; i++)
3806         dst[i] *= src[i];
3807 }
3808
3809 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3810     int i;
3811     src1 += len-1;
3812     for(i=0; i<len; i++)
3813         dst[i] = src0[i] * src1[-i];
3814 }
3815
3816 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3817     int i;
3818     for(i=0; i<len; i++)
3819         dst[i] = src0[i] * src1[i] + src2[i];
3820 }
3821
3822 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3823     int i,j;
3824     dst += len;
3825     win += len;
3826     src0+= len;
3827     for(i=-len, j=len-1; i<0; i++, j--) {
3828         float s0 = src0[i];
3829         float s1 = src1[j];
3830         float wi = win[i];
3831         float wj = win[j];
3832         dst[i] = s0*wj - s1*wi + add_bias;
3833         dst[j] = s0*wi + s1*wj + add_bias;
3834     }
3835 }
3836
3837 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3838                                  int len)
3839 {
3840     int i;
3841     for (i = 0; i < len; i++)
3842         dst[i] = src[i] * mul;
3843 }
3844 #if 0
3845 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3846                                       const float **sv, float mul, int len)
3847 {
3848     int i;
3849     for (i = 0; i < len; i += 2, sv++) {
3850         dst[i  ] = src[i  ] * sv[0][0] * mul;
3851         dst[i+1] = src[i+1] * sv[0][1] * mul;
3852     }
3853 }
3854
3855 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3856                                       const float **sv, float mul, int len)
3857 {
3858     int i;
3859     for (i = 0; i < len; i += 4, sv++) {
3860         dst[i  ] = src[i  ] * sv[0][0] * mul;
3861         dst[i+1] = src[i+1] * sv[0][1] * mul;
3862         dst[i+2] = src[i+2] * sv[0][2] * mul;
3863         dst[i+3] = src[i+3] * sv[0][3] * mul;
3864     }
3865 }
3866
3867 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3868                                int len)
3869 {
3870     int i;
3871     for (i = 0; i < len; i += 2, sv++) {
3872         dst[i  ] = sv[0][0] * mul;
3873         dst[i+1] = sv[0][1] * mul;
3874     }
3875 }
3876
3877 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3878                                int len)
3879 {
3880     int i;
3881     for (i = 0; i < len; i += 4, sv++) {
3882         dst[i  ] = sv[0][0] * mul;
3883         dst[i+1] = sv[0][1] * mul;
3884         dst[i+2] = sv[0][2] * mul;
3885         dst[i+3] = sv[0][3] * mul;
3886     }
3887 }
3888
3889 static void butterflies_float_c(float *restrict v1, float *restrict v2,
3890                                 int len)
3891 {
3892     int i;
3893     for (i = 0; i < len; i++) {
3894         float t = v1[i] - v2[i];
3895         v1[i] += v2[i];
3896         v2[i] = t;
3897     }
3898 }
3899
3900 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3901 {
3902     float p = 0.0;
3903     int i;
3904
3905     for (i = 0; i < len; i++)
3906         p += v1[i] * v2[i];
3907
3908     return p;
3909 }
3910
3911 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3912     int i;
3913     for(i=0; i<len; i++)
3914         dst[i] = src[i] * mul;
3915 }
3916
3917
3918 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3919                    uint32_t maxi, uint32_t maxisign)
3920 {
3921
3922     if(a > mini) return mini;
3923     else if((a^(1<<31)) > maxisign) return maxi;
3924     else return a;
3925 }
3926
3927 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3928     int i;
3929     uint32_t mini = *(uint32_t*)min;
3930     uint32_t maxi = *(uint32_t*)max;
3931     uint32_t maxisign = maxi ^ (1<<31);
3932     uint32_t *dsti = (uint32_t*)dst;
3933     const uint32_t *srci = (const uint32_t*)src;
3934     for(i=0; i<len; i+=8) {
3935         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3936         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3937         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3938         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3939         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3940         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3941         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3942         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
3943     }
3944 }
3945 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3946     int i;
3947     if(min < 0 && max > 0) {
3948         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3949     } else {
3950         for(i=0; i < len; i+=8) {
3951             dst[i    ] = av_clipf(src[i    ], min, max);
3952             dst[i + 1] = av_clipf(src[i + 1], min, max);
3953             dst[i + 2] = av_clipf(src[i + 2], min, max);
3954             dst[i + 3] = av_clipf(src[i + 3], min, max);
3955             dst[i + 4] = av_clipf(src[i + 4], min, max);
3956             dst[i + 5] = av_clipf(src[i + 5], min, max);
3957             dst[i + 6] = av_clipf(src[i + 6], min, max);
3958             dst[i + 7] = av_clipf(src[i + 7], min, max);
3959         }
3960     }
3961 }
3962
3963 static av_always_inline int float_to_int16_one(const float *src){
3964     int_fast32_t tmp = *(const int32_t*)src;
3965     if(tmp & 0xf0000){
3966         tmp = (0x43c0ffff - tmp)>>31;
3967         // is this faster on some gcc/cpu combinations?
3968 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3969 //      else                 tmp = 0;
3970     }
3971     return tmp - 0x8000;
3972 }
3973
3974 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3975     int i;
3976     for(i=0; i<len; i++)
3977         dst[i] = float_to_int16_one(src+i);
3978 }
3979
3980 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3981     int i,j,c;
3982     if(channels==2){
3983         for(i=0; i<len; i++){
3984             dst[2*i]   = float_to_int16_one(src[0]+i);
3985             dst[2*i+1] = float_to_int16_one(src[1]+i);
3986         }
3987     }else{
3988         for(c=0; c<channels; c++)
3989             for(i=0, j=c; i<len; i++, j+=channels)
3990                 dst[j] = float_to_int16_one(src[c]+i);
3991     }
3992 }
3993
3994 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
3995 {
3996     int res = 0;
3997
3998     while (order--)
3999         res += (*v1++ * *v2++) >> shift;
4000
4001     return res;
4002 }
4003
4004 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
4005 {
4006     int res = 0;
4007     while (order--) {
4008         res   += *v1 * *v2++;
4009         *v1++ += mul * *v3++;
4010     }
4011     return res;
4012 }
4013
4014 #define W0 2048
4015 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4016 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4017 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4018 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4019 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4020 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4021 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
4022
4023 static void wmv2_idct_row(short * b)
4024 {
4025     int s1,s2;
4026     int a0,a1,a2,a3,a4,a5,a6,a7;
4027     /*step 1*/
4028     a1 = W1*b[1]+W7*b[7];
4029     a7 = W7*b[1]-W1*b[7];
4030     a5 = W5*b[5]+W3*b[3];
4031     a3 = W3*b[5]-W5*b[3];
4032     a2 = W2*b[2]+W6*b[6];
4033     a6 = W6*b[2]-W2*b[6];
4034     a0 = W0*b[0]+W0*b[4];
4035     a4 = W0*b[0]-W0*b[4];
4036     /*step 2*/
4037     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4038     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4039     /*step 3*/
4040     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4041     b[1] = (a4+a6 +s1   + (1<<7))>>8;
4042     b[2] = (a4-a6 +s2   + (1<<7))>>8;
4043     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4044     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4045     b[5] = (a4-a6 -s2   + (1<<7))>>8;
4046     b[6] = (a4+a6 -s1   + (1<<7))>>8;
4047     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4048 }
4049 static void wmv2_idct_col(short * b)
4050 {
4051     int s1,s2;
4052     int a0,a1,a2,a3,a4,a5,a6,a7;
4053     /*step 1, with extended precision*/
4054     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4055     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4056     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4057     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4058     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4059     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4060     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4061     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4062     /*step 2*/
4063     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4064     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4065     /*step 3*/
4066     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4067     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4068     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4069     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4070
4071     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4072     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4073     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4074     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4075 }
4076 void ff_wmv2_idct_c(short * block){
4077     int i;
4078
4079     for(i=0;i<64;i+=8){
4080         wmv2_idct_row(block+i);
4081     }
4082     for(i=0;i<8;i++){
4083         wmv2_idct_col(block+i);
4084     }
4085 }
4086 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4087  converted */
4088 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4089 {
4090     ff_wmv2_idct_c(block);
4091     put_pixels_clamped_c(block, dest, line_size);
4092 }
4093 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4094 {
4095     ff_wmv2_idct_c(block);
4096     add_pixels_clamped_c(block, dest, line_size);
4097 }
4098 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4099 {
4100     j_rev_dct (block);
4101     put_pixels_clamped_c(block, dest, line_size);
4102 }
4103 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4104 {
4105     j_rev_dct (block);
4106     add_pixels_clamped_c(block, dest, line_size);
4107 }
4108
4109 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4110 {
4111     j_rev_dct4 (block);
4112     put_pixels_clamped4_c(block, dest, line_size);
4113 }
4114 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4115 {
4116     j_rev_dct4 (block);
4117     add_pixels_clamped4_c(block, dest, line_size);
4118 }
4119
4120 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4121 {
4122     j_rev_dct2 (block);
4123     put_pixels_clamped2_c(block, dest, line_size);
4124 }
4125 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4126 {
4127     j_rev_dct2 (block);
4128     add_pixels_clamped2_c(block, dest, line_size);
4129 }
4130
4131 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4132 {
4133     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4134
4135     dest[0] = cm[(block[0] + 4)>>3];
4136 }
4137 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4138 {
4139     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4140
4141     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4142 }
4143
4144 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4145
4146 /* init static data */
4147 av_cold void dsputil_static_init(void)
4148 {
4149     int i;
4150
4151     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4152     for(i=0;i<MAX_NEG_CROP;i++) {
4153         ff_cropTbl[i] = 0;
4154         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4155     }
4156
4157     for(i=0;i<512;i++) {
4158         ff_squareTbl[i] = (i - 256) * (i - 256);
4159     }
4160
4161     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4162 }
4163
4164 int ff_check_alignment(void){
4165     static int did_fail=0;
4166     DECLARE_ALIGNED(16, int, aligned);
4167
4168     if((intptr_t)&aligned & 15){
4169         if(!did_fail){
4170 #if HAVE_MMX || HAVE_ALTIVEC
4171             av_log(NULL, AV_LOG_ERROR,
4172                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4173                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4174                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4175                 "Do not report crashes to FFmpeg developers.\n");
4176 #endif
4177             did_fail=1;
4178         }
4179         return -1;
4180     }
4181     return 0;
4182 }
4183 #endif
4184 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4185 {
4186 #if 0
4187     int i;
4188
4189     ff_check_alignment();
4190
4191 #if CONFIG_ENCODERS
4192     if(avctx->dct_algo==FF_DCT_FASTINT) {
4193         c->fdct = fdct_ifast;
4194         c->fdct248 = fdct_ifast248;
4195     }
4196     else if(avctx->dct_algo==FF_DCT_FAAN) {
4197         c->fdct = ff_faandct;
4198         c->fdct248 = ff_faandct248;
4199     }
4200     else {
4201         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4202         c->fdct248 = ff_fdct248_islow;
4203     }
4204 #endif //CONFIG_ENCODERS
4205
4206     if(avctx->lowres==1){
4207         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4208             c->idct_put= ff_jref_idct4_put;
4209             c->idct_add= ff_jref_idct4_add;
4210         }else{
4211             c->idct_put= ff_h264_lowres_idct_put_c;
4212             c->idct_add= ff_h264_lowres_idct_add_c;
4213         }
4214         c->idct    = j_rev_dct4;
4215         c->idct_permutation_type= FF_NO_IDCT_PERM;
4216     }else if(avctx->lowres==2){
4217         c->idct_put= ff_jref_idct2_put;
4218         c->idct_add= ff_jref_idct2_add;
4219         c->idct    = j_rev_dct2;
4220         c->idct_permutation_type= FF_NO_IDCT_PERM;
4221     }else if(avctx->lowres==3){
4222         c->idct_put= ff_jref_idct1_put;
4223         c->idct_add= ff_jref_idct1_add;
4224         c->idct    = j_rev_dct1;
4225         c->idct_permutation_type= FF_NO_IDCT_PERM;
4226     }else{
4227         if(avctx->idct_algo==FF_IDCT_INT){
4228             c->idct_put= ff_jref_idct_put;
4229             c->idct_add= ff_jref_idct_add;
4230             c->idct    = j_rev_dct;
4231             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4232         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4233                 avctx->idct_algo==FF_IDCT_VP3){
4234             c->idct_put= ff_vp3_idct_put_c;
4235             c->idct_add= ff_vp3_idct_add_c;
4236             c->idct    = ff_vp3_idct_c;
4237             c->idct_permutation_type= FF_NO_IDCT_PERM;
4238         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4239             c->idct_put= ff_wmv2_idct_put_c;
4240             c->idct_add= ff_wmv2_idct_add_c;
4241             c->idct    = ff_wmv2_idct_c;
4242             c->idct_permutation_type= FF_NO_IDCT_PERM;
4243         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4244             c->idct_put= ff_faanidct_put;
4245             c->idct_add= ff_faanidct_add;
4246             c->idct    = ff_faanidct;
4247             c->idct_permutation_type= FF_NO_IDCT_PERM;
4248         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4249             c->idct_put= ff_ea_idct_put_c;
4250             c->idct_permutation_type= FF_NO_IDCT_PERM;
4251         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4252             c->idct     = ff_bink_idct_c;
4253             c->idct_add = ff_bink_idct_add_c;
4254             c->idct_put = ff_bink_idct_put_c;
4255             c->idct_permutation_type = FF_NO_IDCT_PERM;
4256         }else{ //accurate/default
4257             c->idct_put= ff_simple_idct_put;
4258             c->idct_add= ff_simple_idct_add;
4259             c->idct    = ff_simple_idct;
4260             c->idct_permutation_type= FF_NO_IDCT_PERM;
4261         }
4262     }
4263
4264     c->get_pixels = get_pixels_c;
4265     c->diff_pixels = diff_pixels_c;
4266     c->put_pixels_clamped = put_pixels_clamped_c;
4267     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4268     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4269     c->add_pixels_clamped = add_pixels_clamped_c;
4270     c->add_pixels8 = add_pixels8_c;
4271     c->add_pixels4 = add_pixels4_c;
4272     c->sum_abs_dctelem = sum_abs_dctelem_c;
4273     c->gmc1 = gmc1_c;
4274     c->gmc = ff_gmc_c;
4275     c->clear_block = clear_block_c;
4276     c->clear_blocks = clear_blocks_c;
4277     c->pix_sum = pix_sum_c;
4278     c->pix_norm1 = pix_norm1_c;
4279
4280     c->fill_block_tab[0] = fill_block16_c;
4281     c->fill_block_tab[1] = fill_block8_c;
4282     c->scale_block = scale_block_c;
4283
4284     /* TODO [0] 16  [1] 8 */
4285     c->pix_abs[0][0] = pix_abs16_c;
4286     c->pix_abs[0][1] = pix_abs16_x2_c;
4287     c->pix_abs[0][2] = pix_abs16_y2_c;
4288     c->pix_abs[0][3] = pix_abs16_xy2_c;
4289     c->pix_abs[1][0] = pix_abs8_c;
4290     c->pix_abs[1][1] = pix_abs8_x2_c;
4291     c->pix_abs[1][2] = pix_abs8_y2_c;
4292     c->pix_abs[1][3] = pix_abs8_xy2_c;
4293
4294 #define dspfunc(PFX, IDX, NUM) \
4295     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4296     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4297     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4298     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4299
4300     dspfunc(put, 0, 16);
4301     dspfunc(put_no_rnd, 0, 16);
4302     dspfunc(put, 1, 8);
4303     dspfunc(put_no_rnd, 1, 8);
4304     dspfunc(put, 2, 4);
4305     dspfunc(put, 3, 2);
4306
4307     dspfunc(avg, 0, 16);
4308     dspfunc(avg_no_rnd, 0, 16);
4309     dspfunc(avg, 1, 8);
4310     dspfunc(avg_no_rnd, 1, 8);
4311     dspfunc(avg, 2, 4);
4312     dspfunc(avg, 3, 2);
4313 #undef dspfunc
4314
4315     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4316     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4317
4318     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4319     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4320     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4321     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4322     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4323     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4324     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4325     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4326     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4327
4328     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4329     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4330     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4331     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4332     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4333     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4334     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4335     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4336     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4337
4338 #define dspfunc(PFX, IDX, NUM) \
4339     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4340     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4341     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4342     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4343     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4344     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4345     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4346     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4347     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4348     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4349     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4350     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4351     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4352     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4353     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4354     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4355
4356     dspfunc(put_qpel, 0, 16);
4357     dspfunc(put_no_rnd_qpel, 0, 16);
4358
4359     dspfunc(avg_qpel, 0, 16);
4360     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4361
4362     dspfunc(put_qpel, 1, 8);
4363     dspfunc(put_no_rnd_qpel, 1, 8);
4364
4365     dspfunc(avg_qpel, 1, 8);
4366     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4367
4368     dspfunc(put_h264_qpel, 0, 16);
4369     dspfunc(put_h264_qpel, 1, 8);
4370     dspfunc(put_h264_qpel, 2, 4);
4371     dspfunc(put_h264_qpel, 3, 2);
4372     dspfunc(avg_h264_qpel, 0, 16);
4373     dspfunc(avg_h264_qpel, 1, 8);
4374     dspfunc(avg_h264_qpel, 2, 4);
4375
4376 #undef dspfunc
4377     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4378     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4379     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4380     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4381     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4382     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4383     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4384     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4385
4386     c->draw_edges = draw_edges_c;
4387
4388 #if CONFIG_CAVS_DECODER
4389     ff_cavsdsp_init(c,avctx);
4390 #endif
4391
4392 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4393     ff_mlp_init(c, avctx);
4394 #endif
4395 #if CONFIG_VC1_DECODER
4396     ff_vc1dsp_init(c,avctx);
4397 #endif
4398 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4399     ff_intrax8dsp_init(c,avctx);
4400 #endif
4401 #if CONFIG_RV30_DECODER
4402     ff_rv30dsp_init(c,avctx);
4403 #endif
4404 #if CONFIG_RV40_DECODER
4405     ff_rv40dsp_init(c,avctx);
4406     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4407     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4408     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4409     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4410 #endif
4411
4412     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4413     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4414     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4415     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4416     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4417     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4418     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4419     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4420
4421 #define SET_CMP_FUNC(name) \
4422     c->name[0]= name ## 16_c;\
4423     c->name[1]= name ## 8x8_c;
4424
4425     SET_CMP_FUNC(hadamard8_diff)
4426     c->hadamard8_diff[4]= hadamard8_intra16_c;
4427     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4428     SET_CMP_FUNC(dct_sad)
4429     SET_CMP_FUNC(dct_max)
4430 #if CONFIG_GPL
4431     SET_CMP_FUNC(dct264_sad)
4432 #endif
4433     c->sad[0]= pix_abs16_c;
4434     c->sad[1]= pix_abs8_c;
4435     c->sse[0]= sse16_c;
4436     c->sse[1]= sse8_c;
4437     c->sse[2]= sse4_c;
4438     SET_CMP_FUNC(quant_psnr)
4439     SET_CMP_FUNC(rd)
4440     SET_CMP_FUNC(bit)
4441     c->vsad[0]= vsad16_c;
4442     c->vsad[4]= vsad_intra16_c;
4443     c->vsad[5]= vsad_intra8_c;
4444     c->vsse[0]= vsse16_c;
4445     c->vsse[4]= vsse_intra16_c;
4446     c->vsse[5]= vsse_intra8_c;
4447     c->nsse[0]= nsse16_c;
4448     c->nsse[1]= nsse8_c;
4449 #if CONFIG_DWT
4450     ff_dsputil_init_dwt(c);
4451 #endif
4452
4453     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4454
4455     c->add_bytes= add_bytes_c;
4456     c->add_bytes_l2= add_bytes_l2_c;
4457     c->diff_bytes= diff_bytes_c;
4458     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4459     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4460     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4461     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4462     c->bswap_buf= bswap_buf;
4463 #if CONFIG_PNG_DECODER
4464     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4465 #endif
4466
4467     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4468         c->h263_h_loop_filter= h263_h_loop_filter_c;
4469         c->h263_v_loop_filter= h263_v_loop_filter_c;
4470     }
4471
4472     if (CONFIG_VP3_DECODER) {
4473         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4474         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4475     }
4476     if (CONFIG_VP6_DECODER) {
4477         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4478     }
4479
4480     c->h261_loop_filter= h261_loop_filter_c;
4481
4482     c->try_8x8basis= try_8x8basis_c;
4483     c->add_8x8basis= add_8x8basis_c;
4484
4485 #if CONFIG_VORBIS_DECODER
4486     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4487 #endif
4488 #if CONFIG_AC3_DECODER
4489     c->ac3_downmix = ff_ac3_downmix_c;
4490 #endif
4491 #if CONFIG_LPC
4492     c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4493 #endif
4494
4495 #endif /* 0 */
4496     c->vector_fmul = vector_fmul_c;
4497     c->vector_fmul_reverse = vector_fmul_reverse_c;
4498     c->vector_fmul_add = vector_fmul_add_c;
4499     c->vector_fmul_window = ff_vector_fmul_window_c;
4500     //c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4501     //c->vector_clipf = vector_clipf_c;
4502     //c->float_to_int16 = ff_float_to_int16_c;
4503     //c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4504     //c->scalarproduct_int16 = scalarproduct_int16_c;
4505     //c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4506     //c->scalarproduct_float = scalarproduct_float_c;
4507     //c->butterflies_float = butterflies_float_c;
4508     c->vector_fmul_scalar = vector_fmul_scalar_c;
4509 #if 0
4510     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4511     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4512
4513     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4514     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4515
4516     c->shrink[0]= ff_img_copy_plane;
4517     c->shrink[1]= ff_shrink22;
4518     c->shrink[2]= ff_shrink44;
4519     c->shrink[3]= ff_shrink88;
4520
4521     c->prefetch= just_return;
4522
4523     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4524     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4525
4526     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4527     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4528     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4529     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4530     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4531     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4532     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4533     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4534     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4535
4536     for(i=0; i<64; i++){
4537         if(!c->put_2tap_qpel_pixels_tab[0][i])
4538             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4539         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4540             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4541     }
4542
4543     switch(c->idct_permutation_type){
4544     case FF_NO_IDCT_PERM:
4545         for(i=0; i<64; i++)
4546             c->idct_permutation[i]= i;
4547         break;
4548     case FF_LIBMPEG2_IDCT_PERM:
4549         for(i=0; i<64; i++)
4550             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4551         break;
4552     case FF_SIMPLE_IDCT_PERM:
4553         for(i=0; i<64; i++)
4554             c->idct_permutation[i]= simple_mmx_permutation[i];
4555         break;
4556     case FF_TRANSPOSE_IDCT_PERM:
4557         for(i=0; i<64; i++)
4558             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4559         break;
4560     case FF_PARTTRANS_IDCT_PERM:
4561         for(i=0; i<64; i++)
4562             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4563         break;
4564     case FF_SSE2_IDCT_PERM:
4565         for(i=0; i<64; i++)
4566             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4567         break;
4568     default:
4569         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4570     }
4571 #endif /* 0 */
4572 }