release/src/router/ffmpeg/libavcodec/dsputil.c

   1 /*
   2  * DSP utils
   3  * Copyright (c) 2000, 2001 Fabrice Bellard
   4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
   5  *
   6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
   7  *
   8  * This file is part of FFmpeg.
   9  *
  10  * FFmpeg is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * FFmpeg is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with FFmpeg; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 /**
  26  * @file
  27  * DSP utils
  28  */
  29
  30 #include "avcodec.h"
  31 #include "dsputil.h"
  32 #include "simple_idct.h"
  33 #include "faandct.h"
  34 #include "faanidct.h"
  35 #include "mathops.h"
  36 #include "mpegvideo.h"
  37 #include "config.h"
  38 #include "lpc.h"
  39 #include "ac3dec.h"
  40 #include "vorbis.h"
  41 #include "png.h"
  42
  43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
  44 uint32_t ff_squareTbl[512] = {0, };
  45
  46 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
  47 #define pb_7f (~0UL/255 * 0x7f)
  48 #define pb_80 (~0UL/255 * 0x80)
  49
  50 const uint8_t ff_zigzag_direct[64] = {
  51     0,   1,  8, 16,  9,  2,  3, 10,
  52     17, 24, 32, 25, 18, 11,  4,  5,
  53     12, 19, 26, 33, 40, 48, 41, 34,
  54     27, 20, 13,  6,  7, 14, 21, 28,
  55     35, 42, 49, 56, 57, 50, 43, 36,
  56     29, 22, 15, 23, 30, 37, 44, 51,
  57     58, 59, 52, 45, 38, 31, 39, 46,
  58     53, 60, 61, 54, 47, 55, 62, 63
  59 };
  60
  61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
  62    specification, we interleave the fields */
  63 const uint8_t ff_zigzag248_direct[64] = {
  64      0,  8,  1,  9, 16, 24,  2, 10,
  65     17, 25, 32, 40, 48, 56, 33, 41,
  66     18, 26,  3, 11,  4, 12, 19, 27,
  67     34, 42, 49, 57, 50, 58, 35, 43,
  68     20, 28,  5, 13,  6, 14, 21, 29,
  69     36, 44, 51, 59, 52, 60, 37, 45,
  70     22, 30,  7, 15, 23, 31, 38, 46,
  71     53, 61, 54, 62, 39, 47, 55, 63,
  72 };
  73
  74 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
  75 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
  76
  77 const uint8_t ff_alternate_horizontal_scan[64] = {
  78     0,  1,   2,  3,  8,  9, 16, 17,
  79     10, 11,  4,  5,  6,  7, 15, 14,
  80     13, 12, 19, 18, 24, 25, 32, 33,
  81     26, 27, 20, 21, 22, 23, 28, 29,
  82     30, 31, 34, 35, 40, 41, 48, 49,
  83     42, 43, 36, 37, 38, 39, 44, 45,
  84     46, 47, 50, 51, 56, 57, 58, 59,
  85     52, 53, 54, 55, 60, 61, 62, 63,
  86 };
  87
  88 const uint8_t ff_alternate_vertical_scan[64] = {
  89     0,  8,  16, 24,  1,  9,  2, 10,
  90     17, 25, 32, 40, 48, 56, 57, 49,
  91     41, 33, 26, 18,  3, 11,  4, 12,
  92     19, 27, 34, 42, 50, 58, 35, 43,
  93     51, 59, 20, 28,  5, 13,  6, 14,
  94     21, 29, 36, 44, 52, 60, 37, 45,
  95     53, 61, 22, 30,  7, 15, 23, 31,
  96     38, 46, 54, 62, 39, 47, 55, 63,
  97 };
  98
  99 /* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 100  * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
 101 const uint32_t ff_inverse[257]={
 102          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 103  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 104  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 105  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 106  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
 108   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
 109   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
 110   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
 111   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
 112   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
 113   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
 114   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
 115   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
 116   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
 117   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
 118   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
 119   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
 120   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
 121   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
 122   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
 123   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
 124   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
 125   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
 126   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
 127   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
 128   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
 129   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
 130   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
 131   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
 132   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
 133   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
 134   16777216
 135 };
 136
 137 /* Input permutation for the simple_idct_mmx */
 138 static const uint8_t simple_mmx_permutation[64]={
 139         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 140         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 141         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 142         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 143         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 144         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 145         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 146         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 147 };
 148
 149 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
 150
 151 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
 152     int i;
 153     int end;
 154
 155     st->scantable= src_scantable;
 156
 157     for(i=0; i<64; i++){
 158         int j;
 159         j = src_scantable[i];
 160         st->permutated[i] = permutation[j];
 161 #if ARCH_PPC
 162         st->inverse[j] = i;
 163 #endif
 164     }
 165
 166     end=-1;
 167     for(i=0; i<64; i++){
 168         int j;
 169         j = st->permutated[i];
 170         if(j>end) end=j;
 171         st->raster_end[i]= end;
 172     }
 173 }
 174
 175 static int pix_sum_c(uint8_t * pix, int line_size)
 176 {
 177     int s, i, j;
 178
 179     s = 0;
 180     for (i = 0; i < 16; i++) {
 181         for (j = 0; j < 16; j += 8) {
 182             s += pix[0];
 183             s += pix[1];
 184             s += pix[2];
 185             s += pix[3];
 186             s += pix[4];
 187             s += pix[5];
 188             s += pix[6];
 189             s += pix[7];
 190             pix += 8;
 191         }
 192         pix += line_size - 16;
 193     }
 194     return s;
 195 }
 196
 197 static int pix_norm1_c(uint8_t * pix, int line_size)
 198 {
 199     int s, i, j;
 200     uint32_t *sq = ff_squareTbl + 256;
 201
 202     s = 0;
 203     for (i = 0; i < 16; i++) {
 204         for (j = 0; j < 16; j += 8) {
 205 #if 0
 206             s += sq[pix[0]];
 207             s += sq[pix[1]];
 208             s += sq[pix[2]];
 209             s += sq[pix[3]];
 210             s += sq[pix[4]];
 211             s += sq[pix[5]];
 212             s += sq[pix[6]];
 213             s += sq[pix[7]];
 214 #else
 215 #if LONG_MAX > 2147483647
 216             register uint64_t x=*(uint64_t*)pix;
 217             s += sq[x&0xff];
 218             s += sq[(x>>8)&0xff];
 219             s += sq[(x>>16)&0xff];
 220             s += sq[(x>>24)&0xff];
 221             s += sq[(x>>32)&0xff];
 222             s += sq[(x>>40)&0xff];
 223             s += sq[(x>>48)&0xff];
 224             s += sq[(x>>56)&0xff];
 225 #else
 226             register uint32_t x=*(uint32_t*)pix;
 227             s += sq[x&0xff];
 228             s += sq[(x>>8)&0xff];
 229             s += sq[(x>>16)&0xff];
 230             s += sq[(x>>24)&0xff];
 231             x=*(uint32_t*)(pix+4);
 232             s += sq[x&0xff];
 233             s += sq[(x>>8)&0xff];
 234             s += sq[(x>>16)&0xff];
 235             s += sq[(x>>24)&0xff];
 236 #endif
 237 #endif
 238             pix += 8;
 239         }
 240         pix += line_size - 16;
 241     }
 242     return s;
 243 }
 244
 245 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
 246     int i;
 247
 248     for(i=0; i+8<=w; i+=8){
 249         dst[i+0]= bswap_32(src[i+0]);
 250         dst[i+1]= bswap_32(src[i+1]);
 251         dst[i+2]= bswap_32(src[i+2]);
 252         dst[i+3]= bswap_32(src[i+3]);
 253         dst[i+4]= bswap_32(src[i+4]);
 254         dst[i+5]= bswap_32(src[i+5]);
 255         dst[i+6]= bswap_32(src[i+6]);
 256         dst[i+7]= bswap_32(src[i+7]);
 257     }
 258     for(;i<w; i++){
 259         dst[i+0]= bswap_32(src[i+0]);
 260     }
 261 }
 262
 263 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 264 {
 265     int s, i;
 266     uint32_t *sq = ff_squareTbl + 256;
 267
 268     s = 0;
 269     for (i = 0; i < h; i++) {
 270         s += sq[pix1[0] - pix2[0]];
 271         s += sq[pix1[1] - pix2[1]];
 272         s += sq[pix1[2] - pix2[2]];
 273         s += sq[pix1[3] - pix2[3]];
 274         pix1 += line_size;
 275         pix2 += line_size;
 276     }
 277     return s;
 278 }
 279
 280 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
 281 {
 282     int s, i;
 283     uint32_t *sq = ff_squareTbl + 256;
 284
 285     s = 0;
 286     for (i = 0; i < h; i++) {
 287         s += sq[pix1[0] - pix2[0]];
 288         s += sq[pix1[1] - pix2[1]];
 289         s += sq[pix1[2] - pix2[2]];
 290         s += sq[pix1[3] - pix2[3]];
 291         s += sq[pix1[4] - pix2[4]];
 292         s += sq[pix1[5] - pix2[5]];
 293         s += sq[pix1[6] - pix2[6]];
 294         s += sq[pix1[7] - pix2[7]];
 295         pix1 += line_size;
 296         pix2 += line_size;
 297     }
 298     return s;
 299 }
 300
 301 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
 302 {
 303     int s, i;
 304     uint32_t *sq = ff_squareTbl + 256;
 305
 306     s = 0;
 307     for (i = 0; i < h; i++) {
 308         s += sq[pix1[ 0] - pix2[ 0]];
 309         s += sq[pix1[ 1] - pix2[ 1]];
 310         s += sq[pix1[ 2] - pix2[ 2]];
 311         s += sq[pix1[ 3] - pix2[ 3]];
 312         s += sq[pix1[ 4] - pix2[ 4]];
 313         s += sq[pix1[ 5] - pix2[ 5]];
 314         s += sq[pix1[ 6] - pix2[ 6]];
 315         s += sq[pix1[ 7] - pix2[ 7]];
 316         s += sq[pix1[ 8] - pix2[ 8]];
 317         s += sq[pix1[ 9] - pix2[ 9]];
 318         s += sq[pix1[10] - pix2[10]];
 319         s += sq[pix1[11] - pix2[11]];
 320         s += sq[pix1[12] - pix2[12]];
 321         s += sq[pix1[13] - pix2[13]];
 322         s += sq[pix1[14] - pix2[14]];
 323         s += sq[pix1[15] - pix2[15]];
 324
 325         pix1 += line_size;
 326         pix2 += line_size;
 327     }
 328     return s;
 329 }
 330
 331 /* draw the edges of width 'w' of an image of size width, height */
 332 //FIXME check that this is ok for mpeg4 interlaced
 333 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
 334 {
 335     uint8_t *ptr, *last_line;
 336     int i;
 337
 338     last_line = buf + (height - 1) * wrap;
 339     for(i=0;i<w;i++) {
 340         /* top and bottom */
 341         memcpy(buf - (i + 1) * wrap, buf, width);
 342         memcpy(last_line + (i + 1) * wrap, last_line, width);
 343     }
 344     /* left and right */
 345     ptr = buf;
 346     for(i=0;i<height;i++) {
 347         memset(ptr - w, ptr[0], w);
 348         memset(ptr + width, ptr[width-1], w);
 349         ptr += wrap;
 350     }
 351     /* corners */
 352     for(i=0;i<w;i++) {
 353         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
 354         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
 355         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
 356         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
 357     }
 358 }
 359
 360 /**
 361  * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
 362  * @param buf destination buffer
 363  * @param src source buffer
 364  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 365  * @param block_w width of block
 366  * @param block_h height of block
 367  * @param src_x x coordinate of the top left sample of the block in the source buffer
 368  * @param src_y y coordinate of the top left sample of the block in the source buffer
 369  * @param w width of the source buffer
 370  * @param h height of the source buffer
 371  */
 372 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
 373                                     int src_x, int src_y, int w, int h){
 374     int x, y;
 375     int start_y, start_x, end_y, end_x;
 376
 377     if(src_y>= h){
 378         src+= (h-1-src_y)*linesize;
 379         src_y=h-1;
 380     }else if(src_y<=-block_h){
 381         src+= (1-block_h-src_y)*linesize;
 382         src_y=1-block_h;
 383     }
 384     if(src_x>= w){
 385         src+= (w-1-src_x);
 386         src_x=w-1;
 387     }else if(src_x<=-block_w){
 388         src+= (1-block_w-src_x);
 389         src_x=1-block_w;
 390     }
 391
 392     start_y= FFMAX(0, -src_y);
 393     start_x= FFMAX(0, -src_x);
 394     end_y= FFMIN(block_h, h-src_y);
 395     end_x= FFMIN(block_w, w-src_x);
 396
 397     // copy existing part
 398     for(y=start_y; y<end_y; y++){
 399         for(x=start_x; x<end_x; x++){
 400             buf[x + y*linesize]= src[x + y*linesize];
 401         }
 402     }
 403
 404     //top
 405     for(y=0; y<start_y; y++){
 406         for(x=start_x; x<end_x; x++){
 407             buf[x + y*linesize]= buf[x + start_y*linesize];
 408         }
 409     }
 410
 411     //bottom
 412     for(y=end_y; y<block_h; y++){
 413         for(x=start_x; x<end_x; x++){
 414             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
 415         }
 416     }
 417
 418     for(y=0; y<block_h; y++){
 419        //left
 420         for(x=0; x<start_x; x++){
 421             buf[x + y*linesize]= buf[start_x + y*linesize];
 422         }
 423
 424        //right
 425         for(x=end_x; x<block_w; x++){
 426             buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
 427         }
 428     }
 429 }
 430
 431 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
 432 {
 433     int i;
 434
 435     /* read the pixels */
 436     for(i=0;i<8;i++) {
 437         block[0] = pixels[0];
 438         block[1] = pixels[1];
 439         block[2] = pixels[2];
 440         block[3] = pixels[3];
 441         block[4] = pixels[4];
 442         block[5] = pixels[5];
 443         block[6] = pixels[6];
 444         block[7] = pixels[7];
 445         pixels += line_size;
 446         block += 8;
 447     }
 448 }
 449
 450 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
 451                           const uint8_t *s2, int stride){
 452     int i;
 453
 454     /* read the pixels */
 455     for(i=0;i<8;i++) {
 456         block[0] = s1[0] - s2[0];
 457         block[1] = s1[1] - s2[1];
 458         block[2] = s1[2] - s2[2];
 459         block[3] = s1[3] - s2[3];
 460         block[4] = s1[4] - s2[4];
 461         block[5] = s1[5] - s2[5];
 462         block[6] = s1[6] - s2[6];
 463         block[7] = s1[7] - s2[7];
 464         s1 += stride;
 465         s2 += stride;
 466         block += 8;
 467     }
 468 }
 469
 470
 471 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 472                                  int line_size)
 473 {
 474     int i;
 475     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 476
 477     /* read the pixels */
 478     for(i=0;i<8;i++) {
 479         pixels[0] = cm[block[0]];
 480         pixels[1] = cm[block[1]];
 481         pixels[2] = cm[block[2]];
 482         pixels[3] = cm[block[3]];
 483         pixels[4] = cm[block[4]];
 484         pixels[5] = cm[block[5]];
 485         pixels[6] = cm[block[6]];
 486         pixels[7] = cm[block[7]];
 487
 488         pixels += line_size;
 489         block += 8;
 490     }
 491 }
 492
 493 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 494                                  int line_size)
 495 {
 496     int i;
 497     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 498
 499     /* read the pixels */
 500     for(i=0;i<4;i++) {
 501         pixels[0] = cm[block[0]];
 502         pixels[1] = cm[block[1]];
 503         pixels[2] = cm[block[2]];
 504         pixels[3] = cm[block[3]];
 505
 506         pixels += line_size;
 507         block += 8;
 508     }
 509 }
 510
 511 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 512                                  int line_size)
 513 {
 514     int i;
 515     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 516
 517     /* read the pixels */
 518     for(i=0;i<2;i++) {
 519         pixels[0] = cm[block[0]];
 520         pixels[1] = cm[block[1]];
 521
 522         pixels += line_size;
 523         block += 8;
 524     }
 525 }
 526
 527 static void put_signed_pixels_clamped_c(const DCTELEM *block,
 528                                         uint8_t *restrict pixels,
 529                                         int line_size)
 530 {
 531     int i, j;
 532
 533     for (i = 0; i < 8; i++) {
 534         for (j = 0; j < 8; j++) {
 535             if (*block < -128)
 536                 *pixels = 0;
 537             else if (*block > 127)
 538                 *pixels = 255;
 539             else
 540                 *pixels = (uint8_t)(*block + 128);
 541             block++;
 542             pixels++;
 543         }
 544         pixels += (line_size - 8);
 545     }
 546 }
 547
 548 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 549                                     int line_size)
 550 {
 551     int i;
 552
 553     /* read the pixels */
 554     for(i=0;i<8;i++) {
 555         pixels[0] = block[0];
 556         pixels[1] = block[1];
 557         pixels[2] = block[2];
 558         pixels[3] = block[3];
 559         pixels[4] = block[4];
 560         pixels[5] = block[5];
 561         pixels[6] = block[6];
 562         pixels[7] = block[7];
 563
 564         pixels += line_size;
 565         block += 8;
 566     }
 567 }
 568
 569 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
 570                           int line_size)
 571 {
 572     int i;
 573     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 574
 575     /* read the pixels */
 576     for(i=0;i<8;i++) {
 577         pixels[0] = cm[pixels[0] + block[0]];
 578         pixels[1] = cm[pixels[1] + block[1]];
 579         pixels[2] = cm[pixels[2] + block[2]];
 580         pixels[3] = cm[pixels[3] + block[3]];
 581         pixels[4] = cm[pixels[4] + block[4]];
 582         pixels[5] = cm[pixels[5] + block[5]];
 583         pixels[6] = cm[pixels[6] + block[6]];
 584         pixels[7] = cm[pixels[7] + block[7]];
 585         pixels += line_size;
 586         block += 8;
 587     }
 588 }
 589
 590 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
 591                           int line_size)
 592 {
 593     int i;
 594     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 595
 596     /* read the pixels */
 597     for(i=0;i<4;i++) {
 598         pixels[0] = cm[pixels[0] + block[0]];
 599         pixels[1] = cm[pixels[1] + block[1]];
 600         pixels[2] = cm[pixels[2] + block[2]];
 601         pixels[3] = cm[pixels[3] + block[3]];
 602         pixels += line_size;
 603         block += 8;
 604     }
 605 }
 606
 607 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
 608                           int line_size)
 609 {
 610     int i;
 611     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
 612
 613     /* read the pixels */
 614     for(i=0;i<2;i++) {
 615         pixels[0] = cm[pixels[0] + block[0]];
 616         pixels[1] = cm[pixels[1] + block[1]];
 617         pixels += line_size;
 618         block += 8;
 619     }
 620 }
 621
 622 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 623 {
 624     int i;
 625     for(i=0;i<8;i++) {
 626         pixels[0] += block[0];
 627         pixels[1] += block[1];
 628         pixels[2] += block[2];
 629         pixels[3] += block[3];
 630         pixels[4] += block[4];
 631         pixels[5] += block[5];
 632         pixels[6] += block[6];
 633         pixels[7] += block[7];
 634         pixels += line_size;
 635         block += 8;
 636     }
 637 }
 638
 639 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
 640 {
 641     int i;
 642     for(i=0;i<4;i++) {
 643         pixels[0] += block[0];
 644         pixels[1] += block[1];
 645         pixels[2] += block[2];
 646         pixels[3] += block[3];
 647         pixels += line_size;
 648         block += 4;
 649     }
 650 }
 651
 652 static int sum_abs_dctelem_c(DCTELEM *block)
 653 {
 654     int sum=0, i;
 655     for(i=0; i<64; i++)
 656         sum+= FFABS(block[i]);
 657     return sum;
 658 }
 659
 660 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
 661 {
 662     int i;
 663
 664     for (i = 0; i < h; i++) {
 665         memset(block, value, 16);
 666         block += line_size;
 667     }
 668 }
 669
 670 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
 671 {
 672     int i;
 673
 674     for (i = 0; i < h; i++) {
 675         memset(block, value, 8);
 676         block += line_size;
 677     }
 678 }
 679
 680 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
 681 {
 682     int i, j;
 683     uint16_t *dst1 = (uint16_t *) dst;
 684     uint16_t *dst2 = (uint16_t *)(dst + linesize);
 685
 686     for (j = 0; j < 8; j++) {
 687         for (i = 0; i < 8; i++) {
 688             dst1[i] = dst2[i] = src[i] * 0x0101;
 689         }
 690         src  += 8;
 691         dst1 += linesize;
 692         dst2 += linesize;
 693     }
 694 }
 695
 696 #if 0
 697
 698 #define PIXOP2(OPNAME, OP) \
 699 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 700 {\
 701     int i;\
 702     for(i=0; i<h; i++){\
 703         OP(*((uint64_t*)block), AV_RN64(pixels));\
 704         pixels+=line_size;\
 705         block +=line_size;\
 706     }\
 707 }\
 708 \
 709 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 710 {\
 711     int i;\
 712     for(i=0; i<h; i++){\
 713         const uint64_t a= AV_RN64(pixels  );\
 714         const uint64_t b= AV_RN64(pixels+1);\
 715         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 716         pixels+=line_size;\
 717         block +=line_size;\
 718     }\
 719 }\
 720 \
 721 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 722 {\
 723     int i;\
 724     for(i=0; i<h; i++){\
 725         const uint64_t a= AV_RN64(pixels  );\
 726         const uint64_t b= AV_RN64(pixels+1);\
 727         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 728         pixels+=line_size;\
 729         block +=line_size;\
 730     }\
 731 }\
 732 \
 733 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 734 {\
 735     int i;\
 736     for(i=0; i<h; i++){\
 737         const uint64_t a= AV_RN64(pixels          );\
 738         const uint64_t b= AV_RN64(pixels+line_size);\
 739         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 740         pixels+=line_size;\
 741         block +=line_size;\
 742     }\
 743 }\
 744 \
 745 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 746 {\
 747     int i;\
 748     for(i=0; i<h; i++){\
 749         const uint64_t a= AV_RN64(pixels          );\
 750         const uint64_t b= AV_RN64(pixels+line_size);\
 751         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
 752         pixels+=line_size;\
 753         block +=line_size;\
 754     }\
 755 }\
 756 \
 757 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 758 {\
 759         int i;\
 760         const uint64_t a= AV_RN64(pixels  );\
 761         const uint64_t b= AV_RN64(pixels+1);\
 762         uint64_t l0=  (a&0x0303030303030303ULL)\
 763                     + (b&0x0303030303030303ULL)\
 764                     + 0x0202020202020202ULL;\
 765         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 766                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 767         uint64_t l1,h1;\
 768 \
 769         pixels+=line_size;\
 770         for(i=0; i<h; i+=2){\
 771             uint64_t a= AV_RN64(pixels  );\
 772             uint64_t b= AV_RN64(pixels+1);\
 773             l1=  (a&0x0303030303030303ULL)\
 774                + (b&0x0303030303030303ULL);\
 775             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 776               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 777             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 778             pixels+=line_size;\
 779             block +=line_size;\
 780             a= AV_RN64(pixels  );\
 781             b= AV_RN64(pixels+1);\
 782             l0=  (a&0x0303030303030303ULL)\
 783                + (b&0x0303030303030303ULL)\
 784                + 0x0202020202020202ULL;\
 785             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 786               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 787             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 788             pixels+=line_size;\
 789             block +=line_size;\
 790         }\
 791 }\
 792 \
 793 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
 794 {\
 795         int i;\
 796         const uint64_t a= AV_RN64(pixels  );\
 797         const uint64_t b= AV_RN64(pixels+1);\
 798         uint64_t l0=  (a&0x0303030303030303ULL)\
 799                     + (b&0x0303030303030303ULL)\
 800                     + 0x0101010101010101ULL;\
 801         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 802                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 803         uint64_t l1,h1;\
 804 \
 805         pixels+=line_size;\
 806         for(i=0; i<h; i+=2){\
 807             uint64_t a= AV_RN64(pixels  );\
 808             uint64_t b= AV_RN64(pixels+1);\
 809             l1=  (a&0x0303030303030303ULL)\
 810                + (b&0x0303030303030303ULL);\
 811             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 812               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 813             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 814             pixels+=line_size;\
 815             block +=line_size;\
 816             a= AV_RN64(pixels  );\
 817             b= AV_RN64(pixels+1);\
 818             l0=  (a&0x0303030303030303ULL)\
 819                + (b&0x0303030303030303ULL)\
 820                + 0x0101010101010101ULL;\
 821             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
 822               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
 823             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
 824             pixels+=line_size;\
 825             block +=line_size;\
 826         }\
 827 }\
 828 \
 829 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
 830 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
 831 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
 832 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
 833 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
 834 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
 835 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
 836
 837 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
 838 #else // 64 bit variant
 839
 840 #define PIXOP2(OPNAME, OP) \
 841 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 842     int i;\
 843     for(i=0; i<h; i++){\
 844         OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
 845         pixels+=line_size;\
 846         block +=line_size;\
 847     }\
 848 }\
 849 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 850     int i;\
 851     for(i=0; i<h; i++){\
 852         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 853         pixels+=line_size;\
 854         block +=line_size;\
 855     }\
 856 }\
 857 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 858     int i;\
 859     for(i=0; i<h; i++){\
 860         OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
 861         OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
 862         pixels+=line_size;\
 863         block +=line_size;\
 864     }\
 865 }\
 866 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 867     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
 868 }\
 869 \
 870 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 871                                                 int src_stride1, int src_stride2, int h){\
 872     int i;\
 873     for(i=0; i<h; i++){\
 874         uint32_t a,b;\
 875         a= AV_RN32(&src1[i*src_stride1  ]);\
 876         b= AV_RN32(&src2[i*src_stride2  ]);\
 877         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
 878         a= AV_RN32(&src1[i*src_stride1+4]);\
 879         b= AV_RN32(&src2[i*src_stride2+4]);\
 880         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
 881     }\
 882 }\
 883 \
 884 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 885                                                 int src_stride1, int src_stride2, int h){\
 886     int i;\
 887     for(i=0; i<h; i++){\
 888         uint32_t a,b;\
 889         a= AV_RN32(&src1[i*src_stride1  ]);\
 890         b= AV_RN32(&src2[i*src_stride2  ]);\
 891         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 892         a= AV_RN32(&src1[i*src_stride1+4]);\
 893         b= AV_RN32(&src2[i*src_stride2+4]);\
 894         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
 895     }\
 896 }\
 897 \
 898 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 899                                                 int src_stride1, int src_stride2, int h){\
 900     int i;\
 901     for(i=0; i<h; i++){\
 902         uint32_t a,b;\
 903         a= AV_RN32(&src1[i*src_stride1  ]);\
 904         b= AV_RN32(&src2[i*src_stride2  ]);\
 905         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 906     }\
 907 }\
 908 \
 909 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 910                                                 int src_stride1, int src_stride2, int h){\
 911     int i;\
 912     for(i=0; i<h; i++){\
 913         uint32_t a,b;\
 914         a= AV_RN16(&src1[i*src_stride1  ]);\
 915         b= AV_RN16(&src2[i*src_stride2  ]);\
 916         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
 917     }\
 918 }\
 919 \
 920 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 921                                                 int src_stride1, int src_stride2, int h){\
 922     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 923     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 924 }\
 925 \
 926 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
 927                                                 int src_stride1, int src_stride2, int h){\
 928     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
 929     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
 930 }\
 931 \
 932 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 933     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 934 }\
 935 \
 936 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 937     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 938 }\
 939 \
 940 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 941     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 942 }\
 943 \
 944 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 945     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 946 }\
 947 \
 948 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
 949                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
 950     int i;\
 951     for(i=0; i<h; i++){\
 952         uint32_t a, b, c, d, l0, l1, h0, h1;\
 953         a= AV_RN32(&src1[i*src_stride1]);\
 954         b= AV_RN32(&src2[i*src_stride2]);\
 955         c= AV_RN32(&src3[i*src_stride3]);\
 956         d= AV_RN32(&src4[i*src_stride4]);\
 957         l0=  (a&0x03030303UL)\
 958            + (b&0x03030303UL)\
 959            + 0x02020202UL;\
 960         h0= ((a&0xFCFCFCFCUL)>>2)\
 961           + ((b&0xFCFCFCFCUL)>>2);\
 962         l1=  (c&0x03030303UL)\
 963            + (d&0x03030303UL);\
 964         h1= ((c&0xFCFCFCFCUL)>>2)\
 965           + ((d&0xFCFCFCFCUL)>>2);\
 966         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 967         a= AV_RN32(&src1[i*src_stride1+4]);\
 968         b= AV_RN32(&src2[i*src_stride2+4]);\
 969         c= AV_RN32(&src3[i*src_stride3+4]);\
 970         d= AV_RN32(&src4[i*src_stride4+4]);\
 971         l0=  (a&0x03030303UL)\
 972            + (b&0x03030303UL)\
 973            + 0x02020202UL;\
 974         h0= ((a&0xFCFCFCFCUL)>>2)\
 975           + ((b&0xFCFCFCFCUL)>>2);\
 976         l1=  (c&0x03030303UL)\
 977            + (d&0x03030303UL);\
 978         h1= ((c&0xFCFCFCFCUL)>>2)\
 979           + ((d&0xFCFCFCFCUL)>>2);\
 980         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
 981     }\
 982 }\
 983 \
 984 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 985     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 986 }\
 987 \
 988 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 989     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 990 }\
 991 \
 992 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 993     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
 994 }\
 995 \
 996 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
 997     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
 998 }\
 999 \
1000 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1001                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1002     int i;\
1003     for(i=0; i<h; i++){\
1004         uint32_t a, b, c, d, l0, l1, h0, h1;\
1005         a= AV_RN32(&src1[i*src_stride1]);\
1006         b= AV_RN32(&src2[i*src_stride2]);\
1007         c= AV_RN32(&src3[i*src_stride3]);\
1008         d= AV_RN32(&src4[i*src_stride4]);\
1009         l0=  (a&0x03030303UL)\
1010            + (b&0x03030303UL)\
1011            + 0x01010101UL;\
1012         h0= ((a&0xFCFCFCFCUL)>>2)\
1013           + ((b&0xFCFCFCFCUL)>>2);\
1014         l1=  (c&0x03030303UL)\
1015            + (d&0x03030303UL);\
1016         h1= ((c&0xFCFCFCFCUL)>>2)\
1017           + ((d&0xFCFCFCFCUL)>>2);\
1018         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1019         a= AV_RN32(&src1[i*src_stride1+4]);\
1020         b= AV_RN32(&src2[i*src_stride2+4]);\
1021         c= AV_RN32(&src3[i*src_stride3+4]);\
1022         d= AV_RN32(&src4[i*src_stride4+4]);\
1023         l0=  (a&0x03030303UL)\
1024            + (b&0x03030303UL)\
1025            + 0x01010101UL;\
1026         h0= ((a&0xFCFCFCFCUL)>>2)\
1027           + ((b&0xFCFCFCFCUL)>>2);\
1028         l1=  (c&0x03030303UL)\
1029            + (d&0x03030303UL);\
1030         h1= ((c&0xFCFCFCFCUL)>>2)\
1031           + ((d&0xFCFCFCFCUL)>>2);\
1032         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1033     }\
1034 }\
1035 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1036                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1037     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1038     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1039 }\
1040 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1041                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1042     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1043     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1044 }\
1045 \
1046 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1047 {\
1048         int i, a0, b0, a1, b1;\
1049         a0= pixels[0];\
1050         b0= pixels[1] + 2;\
1051         a0 += b0;\
1052         b0 += pixels[2];\
1053 \
1054         pixels+=line_size;\
1055         for(i=0; i<h; i+=2){\
1056             a1= pixels[0];\
1057             b1= pixels[1];\
1058             a1 += b1;\
1059             b1 += pixels[2];\
1060 \
1061             block[0]= (a1+a0)>>2; /* FIXME non put */\
1062             block[1]= (b1+b0)>>2;\
1063 \
1064             pixels+=line_size;\
1065             block +=line_size;\
1066 \
1067             a0= pixels[0];\
1068             b0= pixels[1] + 2;\
1069             a0 += b0;\
1070             b0 += pixels[2];\
1071 \
1072             block[0]= (a1+a0)>>2;\
1073             block[1]= (b1+b0)>>2;\
1074             pixels+=line_size;\
1075             block +=line_size;\
1076         }\
1077 }\
1078 \
1079 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1080 {\
1081         int i;\
1082         const uint32_t a= AV_RN32(pixels  );\
1083         const uint32_t b= AV_RN32(pixels+1);\
1084         uint32_t l0=  (a&0x03030303UL)\
1085                     + (b&0x03030303UL)\
1086                     + 0x02020202UL;\
1087         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1088                    + ((b&0xFCFCFCFCUL)>>2);\
1089         uint32_t l1,h1;\
1090 \
1091         pixels+=line_size;\
1092         for(i=0; i<h; i+=2){\
1093             uint32_t a= AV_RN32(pixels  );\
1094             uint32_t b= AV_RN32(pixels+1);\
1095             l1=  (a&0x03030303UL)\
1096                + (b&0x03030303UL);\
1097             h1= ((a&0xFCFCFCFCUL)>>2)\
1098               + ((b&0xFCFCFCFCUL)>>2);\
1099             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100             pixels+=line_size;\
1101             block +=line_size;\
1102             a= AV_RN32(pixels  );\
1103             b= AV_RN32(pixels+1);\
1104             l0=  (a&0x03030303UL)\
1105                + (b&0x03030303UL)\
1106                + 0x02020202UL;\
1107             h0= ((a&0xFCFCFCFCUL)>>2)\
1108               + ((b&0xFCFCFCFCUL)>>2);\
1109             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1110             pixels+=line_size;\
1111             block +=line_size;\
1112         }\
1113 }\
1114 \
1115 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1116 {\
1117     int j;\
1118     for(j=0; j<2; j++){\
1119         int i;\
1120         const uint32_t a= AV_RN32(pixels  );\
1121         const uint32_t b= AV_RN32(pixels+1);\
1122         uint32_t l0=  (a&0x03030303UL)\
1123                     + (b&0x03030303UL)\
1124                     + 0x02020202UL;\
1125         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1126                    + ((b&0xFCFCFCFCUL)>>2);\
1127         uint32_t l1,h1;\
1128 \
1129         pixels+=line_size;\
1130         for(i=0; i<h; i+=2){\
1131             uint32_t a= AV_RN32(pixels  );\
1132             uint32_t b= AV_RN32(pixels+1);\
1133             l1=  (a&0x03030303UL)\
1134                + (b&0x03030303UL);\
1135             h1= ((a&0xFCFCFCFCUL)>>2)\
1136               + ((b&0xFCFCFCFCUL)>>2);\
1137             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1138             pixels+=line_size;\
1139             block +=line_size;\
1140             a= AV_RN32(pixels  );\
1141             b= AV_RN32(pixels+1);\
1142             l0=  (a&0x03030303UL)\
1143                + (b&0x03030303UL)\
1144                + 0x02020202UL;\
1145             h0= ((a&0xFCFCFCFCUL)>>2)\
1146               + ((b&0xFCFCFCFCUL)>>2);\
1147             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1148             pixels+=line_size;\
1149             block +=line_size;\
1150         }\
1151         pixels+=4-line_size*(h+1);\
1152         block +=4-line_size*h;\
1153     }\
1154 }\
1155 \
1156 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1157 {\
1158     int j;\
1159     for(j=0; j<2; j++){\
1160         int i;\
1161         const uint32_t a= AV_RN32(pixels  );\
1162         const uint32_t b= AV_RN32(pixels+1);\
1163         uint32_t l0=  (a&0x03030303UL)\
1164                     + (b&0x03030303UL)\
1165                     + 0x01010101UL;\
1166         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1167                    + ((b&0xFCFCFCFCUL)>>2);\
1168         uint32_t l1,h1;\
1169 \
1170         pixels+=line_size;\
1171         for(i=0; i<h; i+=2){\
1172             uint32_t a= AV_RN32(pixels  );\
1173             uint32_t b= AV_RN32(pixels+1);\
1174             l1=  (a&0x03030303UL)\
1175                + (b&0x03030303UL);\
1176             h1= ((a&0xFCFCFCFCUL)>>2)\
1177               + ((b&0xFCFCFCFCUL)>>2);\
1178             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1179             pixels+=line_size;\
1180             block +=line_size;\
1181             a= AV_RN32(pixels  );\
1182             b= AV_RN32(pixels+1);\
1183             l0=  (a&0x03030303UL)\
1184                + (b&0x03030303UL)\
1185                + 0x01010101UL;\
1186             h0= ((a&0xFCFCFCFCUL)>>2)\
1187               + ((b&0xFCFCFCFCUL)>>2);\
1188             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1189             pixels+=line_size;\
1190             block +=line_size;\
1191         }\
1192         pixels+=4-line_size*(h+1);\
1193         block +=4-line_size*h;\
1194     }\
1195 }\
1196 \
1197 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1198 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1199 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1200 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1201 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
1202 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1203 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1204 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1205
1206 #define op_avg(a, b) a = rnd_avg32(a, b)
1207 #endif
1208 #define op_put(a, b) a = b
1209
1210 PIXOP2(avg, op_avg)
1211 PIXOP2(put, op_put)
1212 #undef op_avg
1213 #undef op_put
1214
1215 #define avg2(a,b) ((a+b+1)>>1)
1216 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1217
1218 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1219     put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1220 }
1221
1222 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1223     put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1224 }
1225
1226 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1227 {
1228     const int A=(16-x16)*(16-y16);
1229     const int B=(   x16)*(16-y16);
1230     const int C=(16-x16)*(   y16);
1231     const int D=(   x16)*(   y16);
1232     int i;
1233
1234     for(i=0; i<h; i++)
1235     {
1236         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1237         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1238         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1239         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1240         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1241         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1242         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1243         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1244         dst+= stride;
1245         src+= stride;
1246     }
1247 }
1248
1249 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1250                   int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1251 {
1252     int y, vx, vy;
1253     const int s= 1<<shift;
1254
1255     width--;
1256     height--;
1257
1258     for(y=0; y<h; y++){
1259         int x;
1260
1261         vx= ox;
1262         vy= oy;
1263         for(x=0; x<8; x++){ //XXX FIXME optimize
1264             int src_x, src_y, frac_x, frac_y, index;
1265
1266             src_x= vx>>16;
1267             src_y= vy>>16;
1268             frac_x= src_x&(s-1);
1269             frac_y= src_y&(s-1);
1270             src_x>>=shift;
1271             src_y>>=shift;
1272
1273             if((unsigned)src_x < width){
1274                 if((unsigned)src_y < height){
1275                     index= src_x + src_y*stride;
1276                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1277                                            + src[index       +1]*   frac_x )*(s-frac_y)
1278                                         + (  src[index+stride  ]*(s-frac_x)
1279                                            + src[index+stride+1]*   frac_x )*   frac_y
1280                                         + r)>>(shift*2);
1281                 }else{
1282                     index= src_x + av_clip(src_y, 0, height)*stride;
1283                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1284                                           + src[index       +1]*   frac_x )*s
1285                                         + r)>>(shift*2);
1286                 }
1287             }else{
1288                 if((unsigned)src_y < height){
1289                     index= av_clip(src_x, 0, width) + src_y*stride;
1290                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1291                                            + src[index+stride  ]*   frac_y )*s
1292                                         + r)>>(shift*2);
1293                 }else{
1294                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1295                     dst[y*stride + x]=    src[index         ];
1296                 }
1297             }
1298
1299             vx+= dxx;
1300             vy+= dyx;
1301         }
1302         ox += dxy;
1303         oy += dyy;
1304     }
1305 }
1306
1307 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1308     switch(width){
1309     case 2: put_pixels2_c (dst, src, stride, height); break;
1310     case 4: put_pixels4_c (dst, src, stride, height); break;
1311     case 8: put_pixels8_c (dst, src, stride, height); break;
1312     case 16:put_pixels16_c(dst, src, stride, height); break;
1313     }
1314 }
1315
1316 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1317     int i,j;
1318     for (i=0; i < height; i++) {
1319       for (j=0; j < width; j++) {
1320         dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1321       }
1322       src += stride;
1323       dst += stride;
1324     }
1325 }
1326
1327 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1328     int i,j;
1329     for (i=0; i < height; i++) {
1330       for (j=0; j < width; j++) {
1331         dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1332       }
1333       src += stride;
1334       dst += stride;
1335     }
1336 }
1337
1338 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1339     int i,j;
1340     for (i=0; i < height; i++) {
1341       for (j=0; j < width; j++) {
1342         dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1343       }
1344       src += stride;
1345       dst += stride;
1346     }
1347 }
1348
1349 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1350     int i,j;
1351     for (i=0; i < height; i++) {
1352       for (j=0; j < width; j++) {
1353         dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1354       }
1355       src += stride;
1356       dst += stride;
1357     }
1358 }
1359
1360 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1361     int i,j;
1362     for (i=0; i < height; i++) {
1363       for (j=0; j < width; j++) {
1364         dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1365       }
1366       src += stride;
1367       dst += stride;
1368     }
1369 }
1370
1371 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1372     int i,j;
1373     for (i=0; i < height; i++) {
1374       for (j=0; j < width; j++) {
1375         dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1376       }
1377       src += stride;
1378       dst += stride;
1379     }
1380 }
1381
1382 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1383     int i,j;
1384     for (i=0; i < height; i++) {
1385       for (j=0; j < width; j++) {
1386         dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1387       }
1388       src += stride;
1389       dst += stride;
1390     }
1391 }
1392
1393 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1394     int i,j;
1395     for (i=0; i < height; i++) {
1396       for (j=0; j < width; j++) {
1397         dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1398       }
1399       src += stride;
1400       dst += stride;
1401     }
1402 }
1403
1404 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1405     switch(width){
1406     case 2: avg_pixels2_c (dst, src, stride, height); break;
1407     case 4: avg_pixels4_c (dst, src, stride, height); break;
1408     case 8: avg_pixels8_c (dst, src, stride, height); break;
1409     case 16:avg_pixels16_c(dst, src, stride, height); break;
1410     }
1411 }
1412
1413 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1414     int i,j;
1415     for (i=0; i < height; i++) {
1416       for (j=0; j < width; j++) {
1417         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1418       }
1419       src += stride;
1420       dst += stride;
1421     }
1422 }
1423
1424 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1425     int i,j;
1426     for (i=0; i < height; i++) {
1427       for (j=0; j < width; j++) {
1428         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1429       }
1430       src += stride;
1431       dst += stride;
1432     }
1433 }
1434
1435 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1436     int i,j;
1437     for (i=0; i < height; i++) {
1438       for (j=0; j < width; j++) {
1439         dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1440       }
1441       src += stride;
1442       dst += stride;
1443     }
1444 }
1445
1446 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1447     int i,j;
1448     for (i=0; i < height; i++) {
1449       for (j=0; j < width; j++) {
1450         dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1451       }
1452       src += stride;
1453       dst += stride;
1454     }
1455 }
1456
1457 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1458     int i,j;
1459     for (i=0; i < height; i++) {
1460       for (j=0; j < width; j++) {
1461         dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1462       }
1463       src += stride;
1464       dst += stride;
1465     }
1466 }
1467
1468 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1469     int i,j;
1470     for (i=0; i < height; i++) {
1471       for (j=0; j < width; j++) {
1472         dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1473       }
1474       src += stride;
1475       dst += stride;
1476     }
1477 }
1478
1479 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1480     int i,j;
1481     for (i=0; i < height; i++) {
1482       for (j=0; j < width; j++) {
1483         dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1484       }
1485       src += stride;
1486       dst += stride;
1487     }
1488 }
1489
1490 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1491     int i,j;
1492     for (i=0; i < height; i++) {
1493       for (j=0; j < width; j++) {
1494         dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1495       }
1496       src += stride;
1497       dst += stride;
1498     }
1499 }
1500 #if 0
1501 #define TPEL_WIDTH(width)\
1502 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1503     void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1504 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1505     void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1506 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1507     void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1508 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1509     void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1510 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1511     void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1512 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1513     void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1514 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1515     void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1516 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1517     void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1518 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1519     void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1520 #endif
1521
1522 #define H264_CHROMA_MC(OPNAME, OP)\
1523 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1524     const int A=(8-x)*(8-y);\
1525     const int B=(  x)*(8-y);\
1526     const int C=(8-x)*(  y);\
1527     const int D=(  x)*(  y);\
1528     int i;\
1529     \
1530     assert(x<8 && y<8 && x>=0 && y>=0);\
1531 \
1532     if(D){\
1533         for(i=0; i<h; i++){\
1534             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1535             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1536             dst+= stride;\
1537             src+= stride;\
1538         }\
1539     }else{\
1540         const int E= B+C;\
1541         const int step= C ? stride : 1;\
1542         for(i=0; i<h; i++){\
1543             OP(dst[0], (A*src[0] + E*src[step+0]));\
1544             OP(dst[1], (A*src[1] + E*src[step+1]));\
1545             dst+= stride;\
1546             src+= stride;\
1547         }\
1548     }\
1549 }\
1550 \
1551 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1552     const int A=(8-x)*(8-y);\
1553     const int B=(  x)*(8-y);\
1554     const int C=(8-x)*(  y);\
1555     const int D=(  x)*(  y);\
1556     int i;\
1557     \
1558     assert(x<8 && y<8 && x>=0 && y>=0);\
1559 \
1560     if(D){\
1561         for(i=0; i<h; i++){\
1562             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1563             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1564             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1565             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1566             dst+= stride;\
1567             src+= stride;\
1568         }\
1569     }else{\
1570         const int E= B+C;\
1571         const int step= C ? stride : 1;\
1572         for(i=0; i<h; i++){\
1573             OP(dst[0], (A*src[0] + E*src[step+0]));\
1574             OP(dst[1], (A*src[1] + E*src[step+1]));\
1575             OP(dst[2], (A*src[2] + E*src[step+2]));\
1576             OP(dst[3], (A*src[3] + E*src[step+3]));\
1577             dst+= stride;\
1578             src+= stride;\
1579         }\
1580     }\
1581 }\
1582 \
1583 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1584     const int A=(8-x)*(8-y);\
1585     const int B=(  x)*(8-y);\
1586     const int C=(8-x)*(  y);\
1587     const int D=(  x)*(  y);\
1588     int i;\
1589     \
1590     assert(x<8 && y<8 && x>=0 && y>=0);\
1591 \
1592     if(D){\
1593         for(i=0; i<h; i++){\
1594             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1595             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1596             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1597             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1598             OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1599             OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1600             OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1601             OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1602             dst+= stride;\
1603             src+= stride;\
1604         }\
1605     }else{\
1606         const int E= B+C;\
1607         const int step= C ? stride : 1;\
1608         for(i=0; i<h; i++){\
1609             OP(dst[0], (A*src[0] + E*src[step+0]));\
1610             OP(dst[1], (A*src[1] + E*src[step+1]));\
1611             OP(dst[2], (A*src[2] + E*src[step+2]));\
1612             OP(dst[3], (A*src[3] + E*src[step+3]));\
1613             OP(dst[4], (A*src[4] + E*src[step+4]));\
1614             OP(dst[5], (A*src[5] + E*src[step+5]));\
1615             OP(dst[6], (A*src[6] + E*src[step+6]));\
1616             OP(dst[7], (A*src[7] + E*src[step+7]));\
1617             dst+= stride;\
1618             src+= stride;\
1619         }\
1620     }\
1621 }
1622
1623 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1624 #define op_put(a, b) a = (((b) + 32)>>6)
1625
1626 H264_CHROMA_MC(put_       , op_put)
1627 H264_CHROMA_MC(avg_       , op_avg)
1628 #undef op_avg
1629 #undef op_put
1630
1631 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1632     const int A=(8-x)*(8-y);
1633     const int B=(  x)*(8-y);
1634     const int C=(8-x)*(  y);
1635     const int D=(  x)*(  y);
1636     int i;
1637
1638     assert(x<8 && y<8 && x>=0 && y>=0);
1639
1640     for(i=0; i<h; i++)
1641     {
1642         dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1643         dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1644         dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1645         dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1646         dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1647         dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1648         dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1649         dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1650         dst+= stride;
1651         src+= stride;
1652     }
1653 }
1654
1655 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1656     const int A=(8-x)*(8-y);
1657     const int B=(  x)*(8-y);
1658     const int C=(8-x)*(  y);
1659     const int D=(  x)*(  y);
1660     int i;
1661
1662     assert(x<8 && y<8 && x>=0 && y>=0);
1663
1664     for(i=0; i<h; i++)
1665     {
1666         dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1667         dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1668         dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1669         dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1670         dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1671         dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1672         dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1673         dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1674         dst+= stride;
1675         src+= stride;
1676     }
1677 }
1678
1679 #define QPEL_MC(r, OPNAME, RND, OP) \
1680 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1681     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1682     int i;\
1683     for(i=0; i<h; i++)\
1684     {\
1685         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1686         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1687         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1688         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1689         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1690         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1691         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1692         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1693         dst+=dstStride;\
1694         src+=srcStride;\
1695     }\
1696 }\
1697 \
1698 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1699     const int w=8;\
1700     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1701     int i;\
1702     for(i=0; i<w; i++)\
1703     {\
1704         const int src0= src[0*srcStride];\
1705         const int src1= src[1*srcStride];\
1706         const int src2= src[2*srcStride];\
1707         const int src3= src[3*srcStride];\
1708         const int src4= src[4*srcStride];\
1709         const int src5= src[5*srcStride];\
1710         const int src6= src[6*srcStride];\
1711         const int src7= src[7*srcStride];\
1712         const int src8= src[8*srcStride];\
1713         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1714         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1715         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1716         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1717         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1718         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1719         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1720         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1721         dst++;\
1722         src++;\
1723     }\
1724 }\
1725 \
1726 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1727     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1728     int i;\
1729     \
1730     for(i=0; i<h; i++)\
1731     {\
1732         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1733         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1734         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1735         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1736         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1737         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1738         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1739         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1740         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1741         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1742         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1743         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1744         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1745         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1746         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1747         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1748         dst+=dstStride;\
1749         src+=srcStride;\
1750     }\
1751 }\
1752 \
1753 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1754     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1755     int i;\
1756     const int w=16;\
1757     for(i=0; i<w; i++)\
1758     {\
1759         const int src0= src[0*srcStride];\
1760         const int src1= src[1*srcStride];\
1761         const int src2= src[2*srcStride];\
1762         const int src3= src[3*srcStride];\
1763         const int src4= src[4*srcStride];\
1764         const int src5= src[5*srcStride];\
1765         const int src6= src[6*srcStride];\
1766         const int src7= src[7*srcStride];\
1767         const int src8= src[8*srcStride];\
1768         const int src9= src[9*srcStride];\
1769         const int src10= src[10*srcStride];\
1770         const int src11= src[11*srcStride];\
1771         const int src12= src[12*srcStride];\
1772         const int src13= src[13*srcStride];\
1773         const int src14= src[14*srcStride];\
1774         const int src15= src[15*srcStride];\
1775         const int src16= src[16*srcStride];\
1776         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1777         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1778         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1779         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1780         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1781         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1782         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1783         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1784         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1785         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1786         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1787         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1788         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1789         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1790         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1791         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1792         dst++;\
1793         src++;\
1794     }\
1795 }\
1796 \
1797 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1798     OPNAME ## pixels8_c(dst, src, stride, 8);\
1799 }\
1800 \
1801 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1802     uint8_t half[64];\
1803     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1804     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1805 }\
1806 \
1807 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1808     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1809 }\
1810 \
1811 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1812     uint8_t half[64];\
1813     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1814     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1815 }\
1816 \
1817 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1818     uint8_t full[16*9];\
1819     uint8_t half[64];\
1820     copy_block9(full, src, 16, stride, 9);\
1821     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1822     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1823 }\
1824 \
1825 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1826     uint8_t full[16*9];\
1827     copy_block9(full, src, 16, stride, 9);\
1828     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1829 }\
1830 \
1831 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1832     uint8_t full[16*9];\
1833     uint8_t half[64];\
1834     copy_block9(full, src, 16, stride, 9);\
1835     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1836     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1837 }\
1838 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1839     uint8_t full[16*9];\
1840     uint8_t halfH[72];\
1841     uint8_t halfV[64];\
1842     uint8_t halfHV[64];\
1843     copy_block9(full, src, 16, stride, 9);\
1844     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1845     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1846     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1847     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1848 }\
1849 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1850     uint8_t full[16*9];\
1851     uint8_t halfH[72];\
1852     uint8_t halfHV[64];\
1853     copy_block9(full, src, 16, stride, 9);\
1854     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1855     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1856     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1857     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1858 }\
1859 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1860     uint8_t full[16*9];\
1861     uint8_t halfH[72];\
1862     uint8_t halfV[64];\
1863     uint8_t halfHV[64];\
1864     copy_block9(full, src, 16, stride, 9);\
1865     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1866     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1867     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1868     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1869 }\
1870 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1871     uint8_t full[16*9];\
1872     uint8_t halfH[72];\
1873     uint8_t halfHV[64];\
1874     copy_block9(full, src, 16, stride, 9);\
1875     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1876     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1877     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1878     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1879 }\
1880 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1881     uint8_t full[16*9];\
1882     uint8_t halfH[72];\
1883     uint8_t halfV[64];\
1884     uint8_t halfHV[64];\
1885     copy_block9(full, src, 16, stride, 9);\
1886     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1887     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1888     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1889     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1890 }\
1891 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1892     uint8_t full[16*9];\
1893     uint8_t halfH[72];\
1894     uint8_t halfHV[64];\
1895     copy_block9(full, src, 16, stride, 9);\
1896     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1897     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1898     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1899     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1900 }\
1901 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1902     uint8_t full[16*9];\
1903     uint8_t halfH[72];\
1904     uint8_t halfV[64];\
1905     uint8_t halfHV[64];\
1906     copy_block9(full, src, 16, stride, 9);\
1907     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1908     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1909     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1910     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1911 }\
1912 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1913     uint8_t full[16*9];\
1914     uint8_t halfH[72];\
1915     uint8_t halfHV[64];\
1916     copy_block9(full, src, 16, stride, 9);\
1917     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1918     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1919     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1920     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1921 }\
1922 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1923     uint8_t halfH[72];\
1924     uint8_t halfHV[64];\
1925     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1926     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1927     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1928 }\
1929 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1930     uint8_t halfH[72];\
1931     uint8_t halfHV[64];\
1932     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1933     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1934     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1935 }\
1936 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1937     uint8_t full[16*9];\
1938     uint8_t halfH[72];\
1939     uint8_t halfV[64];\
1940     uint8_t halfHV[64];\
1941     copy_block9(full, src, 16, stride, 9);\
1942     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1943     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1944     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1945     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1946 }\
1947 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1948     uint8_t full[16*9];\
1949     uint8_t halfH[72];\
1950     copy_block9(full, src, 16, stride, 9);\
1951     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1952     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1953     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1954 }\
1955 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956     uint8_t full[16*9];\
1957     uint8_t halfH[72];\
1958     uint8_t halfV[64];\
1959     uint8_t halfHV[64];\
1960     copy_block9(full, src, 16, stride, 9);\
1961     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1962     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1963     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1964     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1965 }\
1966 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1967     uint8_t full[16*9];\
1968     uint8_t halfH[72];\
1969     copy_block9(full, src, 16, stride, 9);\
1970     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1971     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1972     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1973 }\
1974 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1975     uint8_t halfH[72];\
1976     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1977     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1978 }\
1979 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1980     OPNAME ## pixels16_c(dst, src, stride, 16);\
1981 }\
1982 \
1983 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1984     uint8_t half[256];\
1985     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1986     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1987 }\
1988 \
1989 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1990     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1991 }\
1992 \
1993 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1994     uint8_t half[256];\
1995     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1996     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1997 }\
1998 \
1999 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2000     uint8_t full[24*17];\
2001     uint8_t half[256];\
2002     copy_block17(full, src, 24, stride, 17);\
2003     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2004     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2005 }\
2006 \
2007 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2008     uint8_t full[24*17];\
2009     copy_block17(full, src, 24, stride, 17);\
2010     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2011 }\
2012 \
2013 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2014     uint8_t full[24*17];\
2015     uint8_t half[256];\
2016     copy_block17(full, src, 24, stride, 17);\
2017     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2018     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2019 }\
2020 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2021     uint8_t full[24*17];\
2022     uint8_t halfH[272];\
2023     uint8_t halfV[256];\
2024     uint8_t halfHV[256];\
2025     copy_block17(full, src, 24, stride, 17);\
2026     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2027     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2028     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2029     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2030 }\
2031 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2032     uint8_t full[24*17];\
2033     uint8_t halfH[272];\
2034     uint8_t halfHV[256];\
2035     copy_block17(full, src, 24, stride, 17);\
2036     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2037     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2038     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2039     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2040 }\
2041 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2042     uint8_t full[24*17];\
2043     uint8_t halfH[272];\
2044     uint8_t halfV[256];\
2045     uint8_t halfHV[256];\
2046     copy_block17(full, src, 24, stride, 17);\
2047     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2048     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2049     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2050     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2051 }\
2052 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2053     uint8_t full[24*17];\
2054     uint8_t halfH[272];\
2055     uint8_t halfHV[256];\
2056     copy_block17(full, src, 24, stride, 17);\
2057     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2058     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2059     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2060     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2061 }\
2062 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2063     uint8_t full[24*17];\
2064     uint8_t halfH[272];\
2065     uint8_t halfV[256];\
2066     uint8_t halfHV[256];\
2067     copy_block17(full, src, 24, stride, 17);\
2068     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2069     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2070     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2071     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2072 }\
2073 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2074     uint8_t full[24*17];\
2075     uint8_t halfH[272];\
2076     uint8_t halfHV[256];\
2077     copy_block17(full, src, 24, stride, 17);\
2078     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2079     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2080     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2081     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2082 }\
2083 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2084     uint8_t full[24*17];\
2085     uint8_t halfH[272];\
2086     uint8_t halfV[256];\
2087     uint8_t halfHV[256];\
2088     copy_block17(full, src, 24, stride, 17);\
2089     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2090     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2091     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2092     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2093 }\
2094 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2095     uint8_t full[24*17];\
2096     uint8_t halfH[272];\
2097     uint8_t halfHV[256];\
2098     copy_block17(full, src, 24, stride, 17);\
2099     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2100     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2101     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2102     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2103 }\
2104 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2105     uint8_t halfH[272];\
2106     uint8_t halfHV[256];\
2107     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2108     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2109     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2110 }\
2111 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2112     uint8_t halfH[272];\
2113     uint8_t halfHV[256];\
2114     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2115     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2116     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2117 }\
2118 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2119     uint8_t full[24*17];\
2120     uint8_t halfH[272];\
2121     uint8_t halfV[256];\
2122     uint8_t halfHV[256];\
2123     copy_block17(full, src, 24, stride, 17);\
2124     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2125     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2126     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2127     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2128 }\
2129 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2130     uint8_t full[24*17];\
2131     uint8_t halfH[272];\
2132     copy_block17(full, src, 24, stride, 17);\
2133     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2134     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2135     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2136 }\
2137 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2138     uint8_t full[24*17];\
2139     uint8_t halfH[272];\
2140     uint8_t halfV[256];\
2141     uint8_t halfHV[256];\
2142     copy_block17(full, src, 24, stride, 17);\
2143     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2144     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2145     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2146     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2147 }\
2148 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2149     uint8_t full[24*17];\
2150     uint8_t halfH[272];\
2151     copy_block17(full, src, 24, stride, 17);\
2152     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2153     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2154     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2155 }\
2156 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2157     uint8_t halfH[272];\
2158     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2159     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2160 }
2161
2162 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2163 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2164 #define op_put(a, b) a = cm[((b) + 16)>>5]
2165 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2166
2167 QPEL_MC(0, put_       , _       , op_put)
2168 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2169 QPEL_MC(0, avg_       , _       , op_avg)
2170 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2171 #undef op_avg
2172 #undef op_avg_no_rnd
2173 #undef op_put
2174 #undef op_put_no_rnd
2175
2176 #if 1
2177 #define H264_LOWPASS(OPNAME, OP, OP2) \
2178 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2179     const int h=2;\
2180     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2181     int i;\
2182     for(i=0; i<h; i++)\
2183     {\
2184         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2185         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2186         dst+=dstStride;\
2187         src+=srcStride;\
2188     }\
2189 }\
2190 \
2191 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2192     const int w=2;\
2193     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2194     int i;\
2195     for(i=0; i<w; i++)\
2196     {\
2197         const int srcB= src[-2*srcStride];\
2198         const int srcA= src[-1*srcStride];\
2199         const int src0= src[0 *srcStride];\
2200         const int src1= src[1 *srcStride];\
2201         const int src2= src[2 *srcStride];\
2202         const int src3= src[3 *srcStride];\
2203         const int src4= src[4 *srcStride];\
2204         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2205         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2206         dst++;\
2207         src++;\
2208     }\
2209 }\
2210 \
2211 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2212     const int h=2;\
2213     const int w=2;\
2214     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2215     int i;\
2216     src -= 2*srcStride;\
2217     for(i=0; i<h+5; i++)\
2218     {\
2219         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2220         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2221         tmp+=tmpStride;\
2222         src+=srcStride;\
2223     }\
2224     tmp -= tmpStride*(h+5-2);\
2225     for(i=0; i<w; i++)\
2226     {\
2227         const int tmpB= tmp[-2*tmpStride];\
2228         const int tmpA= tmp[-1*tmpStride];\
2229         const int tmp0= tmp[0 *tmpStride];\
2230         const int tmp1= tmp[1 *tmpStride];\
2231         const int tmp2= tmp[2 *tmpStride];\
2232         const int tmp3= tmp[3 *tmpStride];\
2233         const int tmp4= tmp[4 *tmpStride];\
2234         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2235         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2236         dst++;\
2237         tmp++;\
2238     }\
2239 }\
2240 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2241     const int h=4;\
2242     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2243     int i;\
2244     for(i=0; i<h; i++)\
2245     {\
2246         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2247         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2248         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2249         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2250         dst+=dstStride;\
2251         src+=srcStride;\
2252     }\
2253 }\
2254 \
2255 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2256     const int w=4;\
2257     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2258     int i;\
2259     for(i=0; i<w; i++)\
2260     {\
2261         const int srcB= src[-2*srcStride];\
2262         const int srcA= src[-1*srcStride];\
2263         const int src0= src[0 *srcStride];\
2264         const int src1= src[1 *srcStride];\
2265         const int src2= src[2 *srcStride];\
2266         const int src3= src[3 *srcStride];\
2267         const int src4= src[4 *srcStride];\
2268         const int src5= src[5 *srcStride];\
2269         const int src6= src[6 *srcStride];\
2270         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2271         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2272         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2273         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2274         dst++;\
2275         src++;\
2276     }\
2277 }\
2278 \
2279 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2280     const int h=4;\
2281     const int w=4;\
2282     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2283     int i;\
2284     src -= 2*srcStride;\
2285     for(i=0; i<h+5; i++)\
2286     {\
2287         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2288         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2289         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2290         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2291         tmp+=tmpStride;\
2292         src+=srcStride;\
2293     }\
2294     tmp -= tmpStride*(h+5-2);\
2295     for(i=0; i<w; i++)\
2296     {\
2297         const int tmpB= tmp[-2*tmpStride];\
2298         const int tmpA= tmp[-1*tmpStride];\
2299         const int tmp0= tmp[0 *tmpStride];\
2300         const int tmp1= tmp[1 *tmpStride];\
2301         const int tmp2= tmp[2 *tmpStride];\
2302         const int tmp3= tmp[3 *tmpStride];\
2303         const int tmp4= tmp[4 *tmpStride];\
2304         const int tmp5= tmp[5 *tmpStride];\
2305         const int tmp6= tmp[6 *tmpStride];\
2306         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2307         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2308         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2309         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2310         dst++;\
2311         tmp++;\
2312     }\
2313 }\
2314 \
2315 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2316     const int h=8;\
2317     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2318     int i;\
2319     for(i=0; i<h; i++)\
2320     {\
2321         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2322         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2323         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2324         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2325         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2326         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2327         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2328         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2329         dst+=dstStride;\
2330         src+=srcStride;\
2331     }\
2332 }\
2333 \
2334 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2335     const int w=8;\
2336     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2337     int i;\
2338     for(i=0; i<w; i++)\
2339     {\
2340         const int srcB= src[-2*srcStride];\
2341         const int srcA= src[-1*srcStride];\
2342         const int src0= src[0 *srcStride];\
2343         const int src1= src[1 *srcStride];\
2344         const int src2= src[2 *srcStride];\
2345         const int src3= src[3 *srcStride];\
2346         const int src4= src[4 *srcStride];\
2347         const int src5= src[5 *srcStride];\
2348         const int src6= src[6 *srcStride];\
2349         const int src7= src[7 *srcStride];\
2350         const int src8= src[8 *srcStride];\
2351         const int src9= src[9 *srcStride];\
2352         const int src10=src[10*srcStride];\
2353         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2354         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2355         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2356         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2357         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2358         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2359         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2360         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2361         dst++;\
2362         src++;\
2363     }\
2364 }\
2365 \
2366 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2367     const int h=8;\
2368     const int w=8;\
2369     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2370     int i;\
2371     src -= 2*srcStride;\
2372     for(i=0; i<h+5; i++)\
2373     {\
2374         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2375         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2376         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2377         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2378         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2379         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2380         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2381         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2382         tmp+=tmpStride;\
2383         src+=srcStride;\
2384     }\
2385     tmp -= tmpStride*(h+5-2);\
2386     for(i=0; i<w; i++)\
2387     {\
2388         const int tmpB= tmp[-2*tmpStride];\
2389         const int tmpA= tmp[-1*tmpStride];\
2390         const int tmp0= tmp[0 *tmpStride];\
2391         const int tmp1= tmp[1 *tmpStride];\
2392         const int tmp2= tmp[2 *tmpStride];\
2393         const int tmp3= tmp[3 *tmpStride];\
2394         const int tmp4= tmp[4 *tmpStride];\
2395         const int tmp5= tmp[5 *tmpStride];\
2396         const int tmp6= tmp[6 *tmpStride];\
2397         const int tmp7= tmp[7 *tmpStride];\
2398         const int tmp8= tmp[8 *tmpStride];\
2399         const int tmp9= tmp[9 *tmpStride];\
2400         const int tmp10=tmp[10*tmpStride];\
2401         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2402         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2403         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2404         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2405         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2406         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2407         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2408         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2409         dst++;\
2410         tmp++;\
2411     }\
2412 }\
2413 \
2414 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2415     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2416     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2417     src += 8*srcStride;\
2418     dst += 8*dstStride;\
2419     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2420     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2421 }\
2422 \
2423 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2424     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2425     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2426     src += 8*srcStride;\
2427     dst += 8*dstStride;\
2428     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2429     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2430 }\
2431 \
2432 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2433     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2434     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2435     src += 8*srcStride;\
2436     dst += 8*dstStride;\
2437     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2438     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2439 }\
2440
2441 #define H264_MC(OPNAME, SIZE) \
2442 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2443     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2444 }\
2445 \
2446 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2447     uint8_t half[SIZE*SIZE];\
2448     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2449     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2450 }\
2451 \
2452 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2453     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2454 }\
2455 \
2456 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2457     uint8_t half[SIZE*SIZE];\
2458     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2459     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2460 }\
2461 \
2462 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2463     uint8_t full[SIZE*(SIZE+5)];\
2464     uint8_t * const full_mid= full + SIZE*2;\
2465     uint8_t half[SIZE*SIZE];\
2466     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2467     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2468     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2469 }\
2470 \
2471 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2472     uint8_t full[SIZE*(SIZE+5)];\
2473     uint8_t * const full_mid= full + SIZE*2;\
2474     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2475     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2476 }\
2477 \
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2479     uint8_t full[SIZE*(SIZE+5)];\
2480     uint8_t * const full_mid= full + SIZE*2;\
2481     uint8_t half[SIZE*SIZE];\
2482     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2483     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2484     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2485 }\
2486 \
2487 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2488     uint8_t full[SIZE*(SIZE+5)];\
2489     uint8_t * const full_mid= full + SIZE*2;\
2490     uint8_t halfH[SIZE*SIZE];\
2491     uint8_t halfV[SIZE*SIZE];\
2492     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2493     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2494     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2495     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2496 }\
2497 \
2498 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2499     uint8_t full[SIZE*(SIZE+5)];\
2500     uint8_t * const full_mid= full + SIZE*2;\
2501     uint8_t halfH[SIZE*SIZE];\
2502     uint8_t halfV[SIZE*SIZE];\
2503     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2504     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2505     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2506     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2507 }\
2508 \
2509 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2510     uint8_t full[SIZE*(SIZE+5)];\
2511     uint8_t * const full_mid= full + SIZE*2;\
2512     uint8_t halfH[SIZE*SIZE];\
2513     uint8_t halfV[SIZE*SIZE];\
2514     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2515     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2516     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2517     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2518 }\
2519 \
2520 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2521     uint8_t full[SIZE*(SIZE+5)];\
2522     uint8_t * const full_mid= full + SIZE*2;\
2523     uint8_t halfH[SIZE*SIZE];\
2524     uint8_t halfV[SIZE*SIZE];\
2525     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2526     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2527     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2528     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2529 }\
2530 \
2531 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2532     int16_t tmp[SIZE*(SIZE+5)];\
2533     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2534 }\
2535 \
2536 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2537     int16_t tmp[SIZE*(SIZE+5)];\
2538     uint8_t halfH[SIZE*SIZE];\
2539     uint8_t halfHV[SIZE*SIZE];\
2540     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2541     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2542     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2543 }\
2544 \
2545 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2546     int16_t tmp[SIZE*(SIZE+5)];\
2547     uint8_t halfH[SIZE*SIZE];\
2548     uint8_t halfHV[SIZE*SIZE];\
2549     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2550     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2551     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2552 }\
2553 \
2554 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2555     uint8_t full[SIZE*(SIZE+5)];\
2556     uint8_t * const full_mid= full + SIZE*2;\
2557     int16_t tmp[SIZE*(SIZE+5)];\
2558     uint8_t halfV[SIZE*SIZE];\
2559     uint8_t halfHV[SIZE*SIZE];\
2560     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2561     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2562     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2563     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2564 }\
2565 \
2566 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2567     uint8_t full[SIZE*(SIZE+5)];\
2568     uint8_t * const full_mid= full + SIZE*2;\
2569     int16_t tmp[SIZE*(SIZE+5)];\
2570     uint8_t halfV[SIZE*SIZE];\
2571     uint8_t halfHV[SIZE*SIZE];\
2572     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2573     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2574     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2575     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2576 }\
2577
2578 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2579 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2580 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2581 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2582 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2583
2584 H264_LOWPASS(put_       , op_put, op2_put)
2585 H264_LOWPASS(avg_       , op_avg, op2_avg)
2586 H264_MC(put_, 2)
2587 H264_MC(put_, 4)
2588 H264_MC(put_, 8)
2589 H264_MC(put_, 16)
2590 H264_MC(avg_, 4)
2591 H264_MC(avg_, 8)
2592 H264_MC(avg_, 16)
2593
2594 #undef op_avg
2595 #undef op_put
2596 #undef op2_avg
2597 #undef op2_put
2598 #endif
2599
2600 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2601     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2602     int i;
2603
2604     for(i=0; i<h; i++){
2605         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2606         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2607         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2608         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2609         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2610         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2611         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2612         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2613         dst+=dstStride;
2614         src+=srcStride;
2615     }
2616 }
2617
2618 #if CONFIG_CAVS_DECODER
2619 /* AVS specific */
2620 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2621     put_pixels8_c(dst, src, stride, 8);
2622 }
2623 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2624     avg_pixels8_c(dst, src, stride, 8);
2625 }
2626 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2627     put_pixels16_c(dst, src, stride, 16);
2628 }
2629 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2630     avg_pixels16_c(dst, src, stride, 16);
2631 }
2632 #endif /* CONFIG_CAVS_DECODER */
2633
2634 #if CONFIG_VC1_DECODER
2635 /* VC-1 specific */
2636 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2637     put_pixels8_c(dst, src, stride, 8);
2638 }
2639 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2640     avg_pixels8_c(dst, src, stride, 8);
2641 }
2642 #endif /* CONFIG_VC1_DECODER */
2643
2644 #if CONFIG_RV40_DECODER
2645 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2646     put_pixels16_xy2_c(dst, src, stride, 16);
2647 }
2648 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2649     avg_pixels16_xy2_c(dst, src, stride, 16);
2650 }
2651 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2652     put_pixels8_xy2_c(dst, src, stride, 8);
2653 }
2654 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2655     avg_pixels8_xy2_c(dst, src, stride, 8);
2656 }
2657 #endif /* CONFIG_RV40_DECODER */
2658
2659 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2660     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2661     int i;
2662
2663     for(i=0; i<w; i++){
2664         const int src_1= src[ -srcStride];
2665         const int src0 = src[0          ];
2666         const int src1 = src[  srcStride];
2667         const int src2 = src[2*srcStride];
2668         const int src3 = src[3*srcStride];
2669         const int src4 = src[4*srcStride];
2670         const int src5 = src[5*srcStride];
2671         const int src6 = src[6*srcStride];
2672         const int src7 = src[7*srcStride];
2673         const int src8 = src[8*srcStride];
2674         const int src9 = src[9*srcStride];
2675         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2676         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2677         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2678         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2679         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2680         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2681         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2682         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
2683         src++;
2684         dst++;
2685     }
2686 }
2687
2688 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2689     put_pixels8_c(dst, src, stride, 8);
2690 }
2691
2692 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2693     uint8_t half[64];
2694     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2695     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2696 }
2697
2698 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2699     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2700 }
2701
2702 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2703     uint8_t half[64];
2704     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2705     put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2706 }
2707
2708 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2709     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2710 }
2711
2712 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2713     uint8_t halfH[88];
2714     uint8_t halfV[64];
2715     uint8_t halfHV[64];
2716     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2717     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2718     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2719     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2720 }
2721 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2722     uint8_t halfH[88];
2723     uint8_t halfV[64];
2724     uint8_t halfHV[64];
2725     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2726     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2727     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2728     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2729 }
2730 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2731     uint8_t halfH[88];
2732     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2733     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2734 }
2735
2736 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2737     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2738     int x;
2739     const int strength= ff_h263_loop_filter_strength[qscale];
2740
2741     for(x=0; x<8; x++){
2742         int d1, d2, ad1;
2743         int p0= src[x-2*stride];
2744         int p1= src[x-1*stride];
2745         int p2= src[x+0*stride];
2746         int p3= src[x+1*stride];
2747         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2748
2749         if     (d<-2*strength) d1= 0;
2750         else if(d<-  strength) d1=-2*strength - d;
2751         else if(d<   strength) d1= d;
2752         else if(d< 2*strength) d1= 2*strength - d;
2753         else                   d1= 0;
2754
2755         p1 += d1;
2756         p2 -= d1;
2757         if(p1&256) p1= ~(p1>>31);
2758         if(p2&256) p2= ~(p2>>31);
2759
2760         src[x-1*stride] = p1;
2761         src[x+0*stride] = p2;
2762
2763         ad1= FFABS(d1)>>1;
2764
2765         d2= av_clip((p0-p3)/4, -ad1, ad1);
2766
2767         src[x-2*stride] = p0 - d2;
2768         src[x+  stride] = p3 + d2;
2769     }
2770     }
2771 }
2772
2773 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2774     if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2775     int y;
2776     const int strength= ff_h263_loop_filter_strength[qscale];
2777
2778     for(y=0; y<8; y++){
2779         int d1, d2, ad1;
2780         int p0= src[y*stride-2];
2781         int p1= src[y*stride-1];
2782         int p2= src[y*stride+0];
2783         int p3= src[y*stride+1];
2784         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2785
2786         if     (d<-2*strength) d1= 0;
2787         else if(d<-  strength) d1=-2*strength - d;
2788         else if(d<   strength) d1= d;
2789         else if(d< 2*strength) d1= 2*strength - d;
2790         else                   d1= 0;
2791
2792         p1 += d1;
2793         p2 -= d1;
2794         if(p1&256) p1= ~(p1>>31);
2795         if(p2&256) p2= ~(p2>>31);
2796
2797         src[y*stride-1] = p1;
2798         src[y*stride+0] = p2;
2799
2800         ad1= FFABS(d1)>>1;
2801
2802         d2= av_clip((p0-p3)/4, -ad1, ad1);
2803
2804         src[y*stride-2] = p0 - d2;
2805         src[y*stride+1] = p3 + d2;
2806     }
2807     }
2808 }
2809
2810 static void h261_loop_filter_c(uint8_t *src, int stride){
2811     int x,y,xy,yz;
2812     int temp[64];
2813
2814     for(x=0; x<8; x++){
2815         temp[x      ] = 4*src[x           ];
2816         temp[x + 7*8] = 4*src[x + 7*stride];
2817     }
2818     for(y=1; y<7; y++){
2819         for(x=0; x<8; x++){
2820             xy = y * stride + x;
2821             yz = y * 8 + x;
2822             temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2823         }
2824     }
2825
2826     for(y=0; y<8; y++){
2827         src[  y*stride] = (temp[  y*8] + 2)>>2;
2828         src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2829         for(x=1; x<7; x++){
2830             xy = y * stride + x;
2831             yz = y * 8 + x;
2832             src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2833         }
2834     }
2835 }
2836
2837 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2838 {
2839     int s, i;
2840
2841     s = 0;
2842     for(i=0;i<h;i++) {
2843         s += abs(pix1[0] - pix2[0]);
2844         s += abs(pix1[1] - pix2[1]);
2845         s += abs(pix1[2] - pix2[2]);
2846         s += abs(pix1[3] - pix2[3]);
2847         s += abs(pix1[4] - pix2[4]);
2848         s += abs(pix1[5] - pix2[5]);
2849         s += abs(pix1[6] - pix2[6]);
2850         s += abs(pix1[7] - pix2[7]);
2851         s += abs(pix1[8] - pix2[8]);
2852         s += abs(pix1[9] - pix2[9]);
2853         s += abs(pix1[10] - pix2[10]);
2854         s += abs(pix1[11] - pix2[11]);
2855         s += abs(pix1[12] - pix2[12]);
2856         s += abs(pix1[13] - pix2[13]);
2857         s += abs(pix1[14] - pix2[14]);
2858         s += abs(pix1[15] - pix2[15]);
2859         pix1 += line_size;
2860         pix2 += line_size;
2861     }
2862     return s;
2863 }
2864
2865 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2866 {
2867     int s, i;
2868
2869     s = 0;
2870     for(i=0;i<h;i++) {
2871         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2872         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2873         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2874         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2875         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2876         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2877         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2878         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2879         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2880         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2881         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2882         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2883         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2884         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2885         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2886         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2887         pix1 += line_size;
2888         pix2 += line_size;
2889     }
2890     return s;
2891 }
2892
2893 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2894 {
2895     int s, i;
2896     uint8_t *pix3 = pix2 + line_size;
2897
2898     s = 0;
2899     for(i=0;i<h;i++) {
2900         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2901         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2902         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2903         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2904         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2905         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2906         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2907         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2908         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2909         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2910         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2911         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2912         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2913         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2914         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2915         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2916         pix1 += line_size;
2917         pix2 += line_size;
2918         pix3 += line_size;
2919     }
2920     return s;
2921 }
2922
2923 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2924 {
2925     int s, i;
2926     uint8_t *pix3 = pix2 + line_size;
2927
2928     s = 0;
2929     for(i=0;i<h;i++) {
2930         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2931         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2932         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2933         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2934         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2935         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2936         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2937         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2938         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2939         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2940         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2941         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2942         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2943         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2944         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2945         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2946         pix1 += line_size;
2947         pix2 += line_size;
2948         pix3 += line_size;
2949     }
2950     return s;
2951 }
2952
2953 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2954 {
2955     int s, i;
2956
2957     s = 0;
2958     for(i=0;i<h;i++) {
2959         s += abs(pix1[0] - pix2[0]);
2960         s += abs(pix1[1] - pix2[1]);
2961         s += abs(pix1[2] - pix2[2]);
2962         s += abs(pix1[3] - pix2[3]);
2963         s += abs(pix1[4] - pix2[4]);
2964         s += abs(pix1[5] - pix2[5]);
2965         s += abs(pix1[6] - pix2[6]);
2966         s += abs(pix1[7] - pix2[7]);
2967         pix1 += line_size;
2968         pix2 += line_size;
2969     }
2970     return s;
2971 }
2972
2973 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2974 {
2975     int s, i;
2976
2977     s = 0;
2978     for(i=0;i<h;i++) {
2979         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2980         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2981         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2982         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2983         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2984         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2985         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2986         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2987         pix1 += line_size;
2988         pix2 += line_size;
2989     }
2990     return s;
2991 }
2992
2993 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2994 {
2995     int s, i;
2996     uint8_t *pix3 = pix2 + line_size;
2997
2998     s = 0;
2999     for(i=0;i<h;i++) {
3000         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3001         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3002         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3003         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3004         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3005         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3006         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3007         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3008         pix1 += line_size;
3009         pix2 += line_size;
3010         pix3 += line_size;
3011     }
3012     return s;
3013 }
3014
3015 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3016 {
3017     int s, i;
3018     uint8_t *pix3 = pix2 + line_size;
3019
3020     s = 0;
3021     for(i=0;i<h;i++) {
3022         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3023         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3024         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3025         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3026         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3027         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3028         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3029         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3030         pix1 += line_size;
3031         pix2 += line_size;
3032         pix3 += line_size;
3033     }
3034     return s;
3035 }
3036
3037 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3038     MpegEncContext *c = v;
3039     int score1=0;
3040     int score2=0;
3041     int x,y;
3042
3043     for(y=0; y<h; y++){
3044         for(x=0; x<16; x++){
3045             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3046         }
3047         if(y+1<h){
3048             for(x=0; x<15; x++){
3049                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3050                              - s1[x+1] + s1[x+1+stride])
3051                         -FFABS(  s2[x  ] - s2[x  +stride]
3052                              - s2[x+1] + s2[x+1+stride]);
3053             }
3054         }
3055         s1+= stride;
3056         s2+= stride;
3057     }
3058
3059     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3060     else  return score1 + FFABS(score2)*8;
3061 }
3062
3063 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3064     MpegEncContext *c = v;
3065     int score1=0;
3066     int score2=0;
3067     int x,y;
3068
3069     for(y=0; y<h; y++){
3070         for(x=0; x<8; x++){
3071             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3072         }
3073         if(y+1<h){
3074             for(x=0; x<7; x++){
3075                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3076                              - s1[x+1] + s1[x+1+stride])
3077                         -FFABS(  s2[x  ] - s2[x  +stride]
3078                              - s2[x+1] + s2[x+1+stride]);
3079             }
3080         }
3081         s1+= stride;
3082         s2+= stride;
3083     }
3084
3085     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3086     else  return score1 + FFABS(score2)*8;
3087 }
3088
3089 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3090     int i;
3091     unsigned int sum=0;
3092
3093     for(i=0; i<8*8; i++){
3094         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3095         int w= weight[i];
3096         b>>= RECON_SHIFT;
3097         assert(-512<b && b<512);
3098
3099         sum += (w*b)*(w*b)>>4;
3100     }
3101     return sum>>2;
3102 }
3103
3104 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3105     int i;
3106
3107     for(i=0; i<8*8; i++){
3108         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3109     }
3110 }
3111
3112 /**
3113  * permutes an 8x8 block.
3114  * @param block the block which will be permuted according to the given permutation vector
3115  * @param permutation the permutation vector
3116  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3117  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3118  *                  (inverse) permutated to scantable order!
3119  */
3120 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3121 {
3122     int i;
3123     DCTELEM temp[64];
3124
3125     if(last<=0) return;
3126     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3127
3128     for(i=0; i<=last; i++){
3129         const int j= scantable[i];
3130         temp[j]= block[j];
3131         block[j]=0;
3132     }
3133
3134     for(i=0; i<=last; i++){
3135         const int j= scantable[i];
3136         const int perm_j= permutation[j];
3137         block[perm_j]= temp[j];
3138     }
3139 }
3140
3141 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3142     return 0;
3143 }
3144
3145 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3146     int i;
3147
3148     memset(cmp, 0, sizeof(void*)*6);
3149
3150     for(i=0; i<6; i++){
3151         switch(type&0xFF){
3152         case FF_CMP_SAD:
3153             cmp[i]= c->sad[i];
3154             break;
3155         case FF_CMP_SATD:
3156             cmp[i]= c->hadamard8_diff[i];
3157             break;
3158         case FF_CMP_SSE:
3159             cmp[i]= c->sse[i];
3160             break;
3161         case FF_CMP_DCT:
3162             cmp[i]= c->dct_sad[i];
3163             break;
3164         case FF_CMP_DCT264:
3165             cmp[i]= c->dct264_sad[i];
3166             break;
3167         case FF_CMP_DCTMAX:
3168             cmp[i]= c->dct_max[i];
3169             break;
3170         case FF_CMP_PSNR:
3171             cmp[i]= c->quant_psnr[i];
3172             break;
3173         case FF_CMP_BIT:
3174             cmp[i]= c->bit[i];
3175             break;
3176         case FF_CMP_RD:
3177             cmp[i]= c->rd[i];
3178             break;
3179         case FF_CMP_VSAD:
3180             cmp[i]= c->vsad[i];
3181             break;
3182         case FF_CMP_VSSE:
3183             cmp[i]= c->vsse[i];
3184             break;
3185         case FF_CMP_ZERO:
3186             cmp[i]= zero_cmp;
3187             break;
3188         case FF_CMP_NSSE:
3189             cmp[i]= c->nsse[i];
3190             break;
3191 #if CONFIG_DWT
3192         case FF_CMP_W53:
3193             cmp[i]= c->w53[i];
3194             break;
3195         case FF_CMP_W97:
3196             cmp[i]= c->w97[i];
3197             break;
3198 #endif
3199         default:
3200             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3201         }
3202     }
3203 }
3204
3205 static void clear_block_c(DCTELEM *block)
3206 {
3207     memset(block, 0, sizeof(DCTELEM)*64);
3208 }
3209
3210 /**
3211  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3212  */
3213 static void clear_blocks_c(DCTELEM *blocks)
3214 {
3215     memset(blocks, 0, sizeof(DCTELEM)*6*64);
3216 }
3217
3218 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3219     long i;
3220     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3221         long a = *(long*)(src+i);
3222         long b = *(long*)(dst+i);
3223         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3224     }
3225     for(; i<w; i++)
3226         dst[i+0] += src[i+0];
3227 }
3228
3229 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3230     long i;
3231     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3232         long a = *(long*)(src1+i);
3233         long b = *(long*)(src2+i);
3234         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3235     }
3236     for(; i<w; i++)
3237         dst[i] = src1[i]+src2[i];
3238 }
3239
3240 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3241     long i;
3242 #if !HAVE_FAST_UNALIGNED
3243     if((long)src2 & (sizeof(long)-1)){
3244         for(i=0; i+7<w; i+=8){
3245             dst[i+0] = src1[i+0]-src2[i+0];
3246             dst[i+1] = src1[i+1]-src2[i+1];
3247             dst[i+2] = src1[i+2]-src2[i+2];
3248             dst[i+3] = src1[i+3]-src2[i+3];
3249             dst[i+4] = src1[i+4]-src2[i+4];
3250             dst[i+5] = src1[i+5]-src2[i+5];
3251             dst[i+6] = src1[i+6]-src2[i+6];
3252             dst[i+7] = src1[i+7]-src2[i+7];
3253         }
3254     }else
3255 #endif
3256     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3257         long a = *(long*)(src1+i);
3258         long b = *(long*)(src2+i);
3259         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3260     }
3261     for(; i<w; i++)
3262         dst[i+0] = src1[i+0]-src2[i+0];
3263 }
3264
3265 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3266     int i;
3267     uint8_t l, lt;
3268
3269     l= *left;
3270     lt= *left_top;
3271
3272     for(i=0; i<w; i++){
3273         l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3274         lt= src1[i];
3275         dst[i]= l;
3276     }
3277
3278     *left= l;
3279     *left_top= lt;
3280 }
3281
3282 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3283     int i;
3284     uint8_t l, lt;
3285
3286     l= *left;
3287     lt= *left_top;
3288
3289     for(i=0; i<w; i++){
3290         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3291         lt= src1[i];
3292         l= src2[i];
3293         dst[i]= l - pred;
3294     }
3295
3296     *left= l;
3297     *left_top= lt;
3298 }
3299
3300 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3301     int i;
3302
3303     for(i=0; i<w-1; i++){
3304         acc+= src[i];
3305         dst[i]= acc;
3306         i++;
3307         acc+= src[i];
3308         dst[i]= acc;
3309     }
3310
3311     for(; i<w; i++){
3312         acc+= src[i];
3313         dst[i]= acc;
3314     }
3315
3316     return acc;
3317 }
3318
3319 #if HAVE_BIGENDIAN
3320 #define B 3
3321 #define G 2
3322 #define R 1
3323 #define A 0
3324 #else
3325 #define B 0
3326 #define G 1
3327 #define R 2
3328 #define A 3
3329 #endif
3330 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
3331     int i;
3332     int r,g,b,a;
3333     r= *red;
3334     g= *green;
3335     b= *blue;
3336     a= *alpha;
3337
3338     for(i=0; i<w; i++){
3339         b+= src[4*i+B];
3340         g+= src[4*i+G];
3341         r+= src[4*i+R];
3342         a+= src[4*i+A];
3343
3344         dst[4*i+B]= b;
3345         dst[4*i+G]= g;
3346         dst[4*i+R]= r;
3347         dst[4*i+A]= a;
3348     }
3349
3350     *red= r;
3351     *green= g;
3352     *blue= b;
3353     *alpha= a;
3354 }
3355 #undef B
3356 #undef G
3357 #undef R
3358 #undef A
3359
3360 #define BUTTERFLY2(o1,o2,i1,i2) \
3361 o1= (i1)+(i2);\
3362 o2= (i1)-(i2);
3363
3364 #define BUTTERFLY1(x,y) \
3365 {\
3366     int a,b;\
3367     a= x;\
3368     b= y;\
3369     x= a+b;\
3370     y= a-b;\
3371 }
3372
3373 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3374
3375 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3376     int i;
3377     int temp[64];
3378     int sum=0;
3379
3380     assert(h==8);
3381
3382     for(i=0; i<8; i++){
3383         //FIXME try pointer walks
3384         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3385         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3386         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3387         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3388
3389         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3390         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3391         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3392         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3393
3394         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3395         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3396         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3397         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3398     }
3399
3400     for(i=0; i<8; i++){
3401         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3402         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3403         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3404         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3405
3406         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3407         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3408         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3409         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3410
3411         sum +=
3412              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3413             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3414             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3415             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3416     }
3417 #if 0
3418 static int maxi=0;
3419 if(sum>maxi){
3420     maxi=sum;
3421     printf("MAX:%d\n", maxi);
3422 }
3423 #endif
3424     return sum;
3425 }
3426
3427 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3428     int i;
3429     int temp[64];
3430     int sum=0;
3431
3432     assert(h==8);
3433
3434     for(i=0; i<8; i++){
3435         //FIXME try pointer walks
3436         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3437         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3438         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3439         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3440
3441         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3442         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3443         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3444         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3445
3446         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3447         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3448         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3449         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3450     }
3451
3452     for(i=0; i<8; i++){
3453         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3454         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3455         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3456         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3457
3458         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3459         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3460         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3461         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3462
3463         sum +=
3464              BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3465             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3466             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3467             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3468     }
3469
3470     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3471
3472     return sum;
3473 }
3474
3475 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3476     MpegEncContext * const s= (MpegEncContext *)c;
3477     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3478
3479     assert(h==8);
3480
3481     s->dsp.diff_pixels(temp, src1, src2, stride);
3482     s->dsp.fdct(temp);
3483     return s->dsp.sum_abs_dctelem(temp);
3484 }
3485
3486 #if CONFIG_GPL
3487 #define DCT8_1D {\
3488     const int s07 = SRC(0) + SRC(7);\
3489     const int s16 = SRC(1) + SRC(6);\
3490     const int s25 = SRC(2) + SRC(5);\
3491     const int s34 = SRC(3) + SRC(4);\
3492     const int a0 = s07 + s34;\
3493     const int a1 = s16 + s25;\
3494     const int a2 = s07 - s34;\
3495     const int a3 = s16 - s25;\
3496     const int d07 = SRC(0) - SRC(7);\
3497     const int d16 = SRC(1) - SRC(6);\
3498     const int d25 = SRC(2) - SRC(5);\
3499     const int d34 = SRC(3) - SRC(4);\
3500     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3501     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3502     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3503     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3504     DST(0,  a0 + a1     ) ;\
3505     DST(1,  a4 + (a7>>2)) ;\
3506     DST(2,  a2 + (a3>>1)) ;\
3507     DST(3,  a5 + (a6>>2)) ;\
3508     DST(4,  a0 - a1     ) ;\
3509     DST(5,  a6 - (a5>>2)) ;\
3510     DST(6, (a2>>1) - a3 ) ;\
3511     DST(7, (a4>>2) - a7 ) ;\
3512 }
3513
3514 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3515     MpegEncContext * const s= (MpegEncContext *)c;
3516     DCTELEM dct[8][8];
3517     int i;
3518     int sum=0;
3519
3520     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3521
3522 #define SRC(x) dct[i][x]
3523 #define DST(x,v) dct[i][x]= v
3524     for( i = 0; i < 8; i++ )
3525         DCT8_1D
3526 #undef SRC
3527 #undef DST
3528
3529 #define SRC(x) dct[x][i]
3530 #define DST(x,v) sum += FFABS(v)
3531     for( i = 0; i < 8; i++ )
3532         DCT8_1D
3533 #undef SRC
3534 #undef DST
3535     return sum;
3536 }
3537 #endif
3538
3539 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3540     MpegEncContext * const s= (MpegEncContext *)c;
3541     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3542     int sum=0, i;
3543
3544     assert(h==8);
3545
3546     s->dsp.diff_pixels(temp, src1, src2, stride);
3547     s->dsp.fdct(temp);
3548
3549     for(i=0; i<64; i++)
3550         sum= FFMAX(sum, FFABS(temp[i]));
3551
3552     return sum;
3553 }
3554
3555 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3556     MpegEncContext * const s= (MpegEncContext *)c;
3557     LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3558     DCTELEM * const bak = temp+64;
3559     int sum=0, i;
3560
3561     assert(h==8);
3562     s->mb_intra=0;
3563
3564     s->dsp.diff_pixels(temp, src1, src2, stride);
3565
3566     memcpy(bak, temp, 64*sizeof(DCTELEM));
3567
3568     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3569     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3570     ff_simple_idct(temp); //FIXME
3571
3572     for(i=0; i<64; i++)
3573         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3574
3575     return sum;
3576 }
3577
3578 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3579     MpegEncContext * const s= (MpegEncContext *)c;
3580     const uint8_t *scantable= s->intra_scantable.permutated;
3581     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3582     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3583     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3584     int i, last, run, bits, level, distortion, start_i;
3585     const int esc_length= s->ac_esc_length;
3586     uint8_t * length;
3587     uint8_t * last_length;
3588
3589     assert(h==8);
3590
3591     copy_block8(lsrc1, src1, 8, stride, 8);
3592     copy_block8(lsrc2, src2, 8, stride, 8);
3593
3594     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3595
3596     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3597
3598     bits=0;
3599
3600     if (s->mb_intra) {
3601         start_i = 1;
3602         length     = s->intra_ac_vlc_length;
3603         last_length= s->intra_ac_vlc_last_length;
3604         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3605     } else {
3606         start_i = 0;
3607         length     = s->inter_ac_vlc_length;
3608         last_length= s->inter_ac_vlc_last_length;
3609     }
3610
3611     if(last>=start_i){
3612         run=0;
3613         for(i=start_i; i<last; i++){
3614             int j= scantable[i];
3615             level= temp[j];
3616
3617             if(level){
3618                 level+=64;
3619                 if((level&(~127)) == 0){
3620                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3621                 }else
3622                     bits+= esc_length;
3623                 run=0;
3624             }else
3625                 run++;
3626         }
3627         i= scantable[last];
3628
3629         level= temp[i] + 64;
3630
3631         assert(level - 64);
3632
3633         if((level&(~127)) == 0){
3634             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3635         }else
3636             bits+= esc_length;
3637
3638     }
3639
3640     if(last>=0){
3641         if(s->mb_intra)
3642             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3643         else
3644             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3645     }
3646
3647     s->dsp.idct_add(lsrc2, 8, temp);
3648
3649     distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3650
3651     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3652 }
3653
3654 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3655     MpegEncContext * const s= (MpegEncContext *)c;
3656     const uint8_t *scantable= s->intra_scantable.permutated;
3657     LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3658     int i, last, run, bits, level, start_i;
3659     const int esc_length= s->ac_esc_length;
3660     uint8_t * length;
3661     uint8_t * last_length;
3662
3663     assert(h==8);
3664
3665     s->dsp.diff_pixels(temp, src1, src2, stride);
3666
3667     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3668
3669     bits=0;
3670
3671     if (s->mb_intra) {
3672         start_i = 1;
3673         length     = s->intra_ac_vlc_length;
3674         last_length= s->intra_ac_vlc_last_length;
3675         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3676     } else {
3677         start_i = 0;
3678         length     = s->inter_ac_vlc_length;
3679         last_length= s->inter_ac_vlc_last_length;
3680     }
3681
3682     if(last>=start_i){
3683         run=0;
3684         for(i=start_i; i<last; i++){
3685             int j= scantable[i];
3686             level= temp[j];
3687
3688             if(level){
3689                 level+=64;
3690                 if((level&(~127)) == 0){
3691                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3692                 }else
3693                     bits+= esc_length;
3694                 run=0;
3695             }else
3696                 run++;
3697         }
3698         i= scantable[last];
3699
3700         level= temp[i] + 64;
3701
3702         assert(level - 64);
3703
3704         if((level&(~127)) == 0){
3705             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3706         }else
3707             bits+= esc_length;
3708     }
3709
3710     return bits;
3711 }
3712
3713 #define VSAD_INTRA(size) \
3714 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3715     int score=0;                                                                                            \
3716     int x,y;                                                                                                \
3717                                                                                                             \
3718     for(y=1; y<h; y++){                                                                                     \
3719         for(x=0; x<size; x+=4){                                                                             \
3720             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
3721                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
3722         }                                                                                                   \
3723         s+= stride;                                                                                         \
3724     }                                                                                                       \
3725                                                                                                             \
3726     return score;                                                                                           \
3727 }
3728 VSAD_INTRA(8)
3729 VSAD_INTRA(16)
3730
3731 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3732     int score=0;
3733     int x,y;
3734
3735     for(y=1; y<h; y++){
3736         for(x=0; x<16; x++){
3737             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3738         }
3739         s1+= stride;
3740         s2+= stride;
3741     }
3742
3743     return score;
3744 }
3745
3746 #define SQ(a) ((a)*(a))
3747 #define VSSE_INTRA(size) \
3748 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3749     int score=0;                                                                                            \
3750     int x,y;                                                                                                \
3751                                                                                                             \
3752     for(y=1; y<h; y++){                                                                                     \
3753         for(x=0; x<size; x+=4){                                                                               \
3754             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
3755                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
3756         }                                                                                                   \
3757         s+= stride;                                                                                         \
3758     }                                                                                                       \
3759                                                                                                             \
3760     return score;                                                                                           \
3761 }
3762 VSSE_INTRA(8)
3763 VSSE_INTRA(16)
3764
3765 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3766     int score=0;
3767     int x,y;
3768
3769     for(y=1; y<h; y++){
3770         for(x=0; x<16; x++){
3771             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3772         }
3773         s1+= stride;
3774         s2+= stride;
3775     }
3776
3777     return score;
3778 }
3779
3780 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3781                                int size){
3782     int score=0;
3783     int i;
3784     for(i=0; i<size; i++)
3785         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3786     return score;
3787 }
3788
3789 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3790 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3791 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3792 #if CONFIG_GPL
3793 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3794 #endif
3795 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3796 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3797 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3798 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
3799
3800 static void vector_fmul_c(float *dst, const float *src, int len){
3801     int i;
3802     for(i=0; i<len; i++)
3803         dst[i] *= src[i];
3804 }
3805
3806 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3807     int i;
3808     src1 += len-1;
3809     for(i=0; i<len; i++)
3810         dst[i] = src0[i] * src1[-i];
3811 }
3812
3813 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3814     int i;
3815     for(i=0; i<len; i++)
3816         dst[i] = src0[i] * src1[i] + src2[i];
3817 }
3818
3819 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3820     int i,j;
3821     dst += len;
3822     win += len;
3823     src0+= len;
3824     for(i=-len, j=len-1; i<0; i++, j--) {
3825         float s0 = src0[i];
3826         float s1 = src1[j];
3827         float wi = win[i];
3828         float wj = win[j];
3829         dst[i] = s0*wj - s1*wi + add_bias;
3830         dst[j] = s0*wi + s1*wj + add_bias;
3831     }
3832 }
3833
3834 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3835                                  int len)
3836 {
3837     int i;
3838     for (i = 0; i < len; i++)
3839         dst[i] = src[i] * mul;
3840 }
3841
3842 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3843                                       const float **sv, float mul, int len)
3844 {
3845     int i;
3846     for (i = 0; i < len; i += 2, sv++) {
3847         dst[i  ] = src[i  ] * sv[0][0] * mul;
3848         dst[i+1] = src[i+1] * sv[0][1] * mul;
3849     }
3850 }
3851
3852 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3853                                       const float **sv, float mul, int len)
3854 {
3855     int i;
3856     for (i = 0; i < len; i += 4, sv++) {
3857         dst[i  ] = src[i  ] * sv[0][0] * mul;
3858         dst[i+1] = src[i+1] * sv[0][1] * mul;
3859         dst[i+2] = src[i+2] * sv[0][2] * mul;
3860         dst[i+3] = src[i+3] * sv[0][3] * mul;
3861     }
3862 }
3863
3864 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3865                                int len)
3866 {
3867     int i;
3868     for (i = 0; i < len; i += 2, sv++) {
3869         dst[i  ] = sv[0][0] * mul;
3870         dst[i+1] = sv[0][1] * mul;
3871     }
3872 }
3873
3874 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3875                                int len)
3876 {
3877     int i;
3878     for (i = 0; i < len; i += 4, sv++) {
3879         dst[i  ] = sv[0][0] * mul;
3880         dst[i+1] = sv[0][1] * mul;
3881         dst[i+2] = sv[0][2] * mul;
3882         dst[i+3] = sv[0][3] * mul;
3883     }
3884 }
3885
3886 static void butterflies_float_c(float *restrict v1, float *restrict v2,
3887                                 int len)
3888 {
3889     int i;
3890     for (i = 0; i < len; i++) {
3891         float t = v1[i] - v2[i];
3892         v1[i] += v2[i];
3893         v2[i] = t;
3894     }
3895 }
3896
3897 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3898 {
3899     float p = 0.0;
3900     int i;
3901
3902     for (i = 0; i < len; i++)
3903         p += v1[i] * v2[i];
3904
3905     return p;
3906 }
3907
3908 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3909     int i;
3910     for(i=0; i<len; i++)
3911         dst[i] = src[i] * mul;
3912 }
3913
3914 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3915                    uint32_t maxi, uint32_t maxisign)
3916 {
3917
3918     if(a > mini) return mini;
3919     else if((a^(1<<31)) > maxisign) return maxi;
3920     else return a;
3921 }
3922
3923 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3924     int i;
3925     uint32_t mini = *(uint32_t*)min;
3926     uint32_t maxi = *(uint32_t*)max;
3927     uint32_t maxisign = maxi ^ (1<<31);
3928     uint32_t *dsti = (uint32_t*)dst;
3929     const uint32_t *srci = (const uint32_t*)src;
3930     for(i=0; i<len; i+=8) {
3931         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3932         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3933         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3934         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3935         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3936         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3937         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3938         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
3939     }
3940 }
3941 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3942     int i;
3943     if(min < 0 && max > 0) {
3944         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3945     } else {
3946         for(i=0; i < len; i+=8) {
3947             dst[i    ] = av_clipf(src[i    ], min, max);
3948             dst[i + 1] = av_clipf(src[i + 1], min, max);
3949             dst[i + 2] = av_clipf(src[i + 2], min, max);
3950             dst[i + 3] = av_clipf(src[i + 3], min, max);
3951             dst[i + 4] = av_clipf(src[i + 4], min, max);
3952             dst[i + 5] = av_clipf(src[i + 5], min, max);
3953             dst[i + 6] = av_clipf(src[i + 6], min, max);
3954             dst[i + 7] = av_clipf(src[i + 7], min, max);
3955         }
3956     }
3957 }
3958
3959 static av_always_inline int float_to_int16_one(const float *src){
3960     int_fast32_t tmp = *(const int32_t*)src;
3961     if(tmp & 0xf0000){
3962         tmp = (0x43c0ffff - tmp)>>31;
3963         // is this faster on some gcc/cpu combinations?
3964 //      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3965 //      else                 tmp = 0;
3966     }
3967     return tmp - 0x8000;
3968 }
3969
3970 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3971     int i;
3972     for(i=0; i<len; i++)
3973         dst[i] = float_to_int16_one(src+i);
3974 }
3975
3976 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3977     int i,j,c;
3978     if(channels==2){
3979         for(i=0; i<len; i++){
3980             dst[2*i]   = float_to_int16_one(src[0]+i);
3981             dst[2*i+1] = float_to_int16_one(src[1]+i);
3982         }
3983     }else{
3984         for(c=0; c<channels; c++)
3985             for(i=0, j=c; i<len; i++, j+=channels)
3986                 dst[j] = float_to_int16_one(src[c]+i);
3987     }
3988 }
3989
3990 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
3991 {
3992     int res = 0;
3993
3994     while (order--)
3995         res += (*v1++ * *v2++) >> shift;
3996
3997     return res;
3998 }
3999
4000 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
4001 {
4002     int res = 0;
4003     while (order--) {
4004         res   += *v1 * *v2++;
4005         *v1++ += mul * *v3++;
4006     }
4007     return res;
4008 }
4009
4010 #define W0 2048
4011 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4012 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4013 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4014 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4015 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4016 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4017 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
4018
4019 static void wmv2_idct_row(short * b)
4020 {
4021     int s1,s2;
4022     int a0,a1,a2,a3,a4,a5,a6,a7;
4023     /*step 1*/
4024     a1 = W1*b[1]+W7*b[7];
4025     a7 = W7*b[1]-W1*b[7];
4026     a5 = W5*b[5]+W3*b[3];
4027     a3 = W3*b[5]-W5*b[3];
4028     a2 = W2*b[2]+W6*b[6];
4029     a6 = W6*b[2]-W2*b[6];
4030     a0 = W0*b[0]+W0*b[4];
4031     a4 = W0*b[0]-W0*b[4];
4032     /*step 2*/
4033     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4034     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4035     /*step 3*/
4036     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4037     b[1] = (a4+a6 +s1   + (1<<7))>>8;
4038     b[2] = (a4-a6 +s2   + (1<<7))>>8;
4039     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4040     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4041     b[5] = (a4-a6 -s2   + (1<<7))>>8;
4042     b[6] = (a4+a6 -s1   + (1<<7))>>8;
4043     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4044 }
4045 static void wmv2_idct_col(short * b)
4046 {
4047     int s1,s2;
4048     int a0,a1,a2,a3,a4,a5,a6,a7;
4049     /*step 1, with extended precision*/
4050     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4051     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4052     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4053     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4054     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4055     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4056     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4057     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4058     /*step 2*/
4059     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4060     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4061     /*step 3*/
4062     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4063     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4064     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4065     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4066
4067     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4068     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4069     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4070     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4071 }
4072 void ff_wmv2_idct_c(short * block){
4073     int i;
4074
4075     for(i=0;i<64;i+=8){
4076         wmv2_idct_row(block+i);
4077     }
4078     for(i=0;i<8;i++){
4079         wmv2_idct_col(block+i);
4080     }
4081 }
4082 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4083  converted */
4084 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4085 {
4086     ff_wmv2_idct_c(block);
4087     put_pixels_clamped_c(block, dest, line_size);
4088 }
4089 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4090 {
4091     ff_wmv2_idct_c(block);
4092     add_pixels_clamped_c(block, dest, line_size);
4093 }
4094 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4095 {
4096     j_rev_dct (block);
4097     put_pixels_clamped_c(block, dest, line_size);
4098 }
4099 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4100 {
4101     j_rev_dct (block);
4102     add_pixels_clamped_c(block, dest, line_size);
4103 }
4104
4105 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4106 {
4107     j_rev_dct4 (block);
4108     put_pixels_clamped4_c(block, dest, line_size);
4109 }
4110 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4111 {
4112     j_rev_dct4 (block);
4113     add_pixels_clamped4_c(block, dest, line_size);
4114 }
4115
4116 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4117 {
4118     j_rev_dct2 (block);
4119     put_pixels_clamped2_c(block, dest, line_size);
4120 }
4121 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4122 {
4123     j_rev_dct2 (block);
4124     add_pixels_clamped2_c(block, dest, line_size);
4125 }
4126
4127 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4128 {
4129     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4130
4131     dest[0] = cm[(block[0] + 4)>>3];
4132 }
4133 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4134 {
4135     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4136
4137     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4138 }
4139
4140 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4141
4142 /* init static data */
4143 av_cold void dsputil_static_init(void)
4144 {
4145     int i;
4146
4147     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4148     for(i=0;i<MAX_NEG_CROP;i++) {
4149         ff_cropTbl[i] = 0;
4150         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4151     }
4152
4153     for(i=0;i<512;i++) {
4154         ff_squareTbl[i] = (i - 256) * (i - 256);
4155     }
4156
4157     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4158 }
4159
4160 int ff_check_alignment(void){
4161     static int did_fail=0;
4162     DECLARE_ALIGNED(16, int, aligned);
4163
4164     if((intptr_t)&aligned & 15){
4165         if(!did_fail){
4166 #if HAVE_MMX || HAVE_ALTIVEC
4167             av_log(NULL, AV_LOG_ERROR,
4168                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4169                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4170                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4171                 "Do not report crashes to FFmpeg developers.\n");
4172 #endif
4173             did_fail=1;
4174         }
4175         return -1;
4176     }
4177     return 0;
4178 }
4179
4180 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4181 {
4182     int i;
4183
4184     ff_check_alignment();
4185
4186 #if CONFIG_ENCODERS
4187     if(avctx->dct_algo==FF_DCT_FASTINT) {
4188         c->fdct = fdct_ifast;
4189         c->fdct248 = fdct_ifast248;
4190     }
4191     else if(avctx->dct_algo==FF_DCT_FAAN) {
4192         c->fdct = ff_faandct;
4193         c->fdct248 = ff_faandct248;
4194     }
4195     else {
4196         c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4197         c->fdct248 = ff_fdct248_islow;
4198     }
4199 #endif //CONFIG_ENCODERS
4200
4201     if(avctx->lowres==1){
4202         if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4203             c->idct_put= ff_jref_idct4_put;
4204             c->idct_add= ff_jref_idct4_add;
4205         }else{
4206             c->idct_put= ff_h264_lowres_idct_put_c;
4207             c->idct_add= ff_h264_lowres_idct_add_c;
4208         }
4209         c->idct    = j_rev_dct4;
4210         c->idct_permutation_type= FF_NO_IDCT_PERM;
4211     }else if(avctx->lowres==2){
4212         c->idct_put= ff_jref_idct2_put;
4213         c->idct_add= ff_jref_idct2_add;
4214         c->idct    = j_rev_dct2;
4215         c->idct_permutation_type= FF_NO_IDCT_PERM;
4216     }else if(avctx->lowres==3){
4217         c->idct_put= ff_jref_idct1_put;
4218         c->idct_add= ff_jref_idct1_add;
4219         c->idct    = j_rev_dct1;
4220         c->idct_permutation_type= FF_NO_IDCT_PERM;
4221     }else{
4222         if(avctx->idct_algo==FF_IDCT_INT){
4223             c->idct_put= ff_jref_idct_put;
4224             c->idct_add= ff_jref_idct_add;
4225             c->idct    = j_rev_dct;
4226             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4227         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4228                 avctx->idct_algo==FF_IDCT_VP3){
4229             c->idct_put= ff_vp3_idct_put_c;
4230             c->idct_add= ff_vp3_idct_add_c;
4231             c->idct    = ff_vp3_idct_c;
4232             c->idct_permutation_type= FF_NO_IDCT_PERM;
4233         }else if(avctx->idct_algo==FF_IDCT_WMV2){
4234             c->idct_put= ff_wmv2_idct_put_c;
4235             c->idct_add= ff_wmv2_idct_add_c;
4236             c->idct    = ff_wmv2_idct_c;
4237             c->idct_permutation_type= FF_NO_IDCT_PERM;
4238         }else if(avctx->idct_algo==FF_IDCT_FAAN){
4239             c->idct_put= ff_faanidct_put;
4240             c->idct_add= ff_faanidct_add;
4241             c->idct    = ff_faanidct;
4242             c->idct_permutation_type= FF_NO_IDCT_PERM;
4243         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4244             c->idct_put= ff_ea_idct_put_c;
4245             c->idct_permutation_type= FF_NO_IDCT_PERM;
4246         }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4247             c->idct     = ff_bink_idct_c;
4248             c->idct_add = ff_bink_idct_add_c;
4249             c->idct_put = ff_bink_idct_put_c;
4250             c->idct_permutation_type = FF_NO_IDCT_PERM;
4251         }else{ //accurate/default
4252             c->idct_put= ff_simple_idct_put;
4253             c->idct_add= ff_simple_idct_add;
4254             c->idct    = ff_simple_idct;
4255             c->idct_permutation_type= FF_NO_IDCT_PERM;
4256         }
4257     }
4258
4259     c->get_pixels = get_pixels_c;
4260     c->diff_pixels = diff_pixels_c;
4261     c->put_pixels_clamped = put_pixels_clamped_c;
4262     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4263     c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4264     c->add_pixels_clamped = add_pixels_clamped_c;
4265     c->add_pixels8 = add_pixels8_c;
4266     c->add_pixels4 = add_pixels4_c;
4267     c->sum_abs_dctelem = sum_abs_dctelem_c;
4268     c->gmc1 = gmc1_c;
4269     c->gmc = ff_gmc_c;
4270     c->clear_block = clear_block_c;
4271     c->clear_blocks = clear_blocks_c;
4272     c->pix_sum = pix_sum_c;
4273     c->pix_norm1 = pix_norm1_c;
4274
4275     c->fill_block_tab[0] = fill_block16_c;
4276     c->fill_block_tab[1] = fill_block8_c;
4277     c->scale_block = scale_block_c;
4278
4279     /* TODO [0] 16  [1] 8 */
4280     c->pix_abs[0][0] = pix_abs16_c;
4281     c->pix_abs[0][1] = pix_abs16_x2_c;
4282     c->pix_abs[0][2] = pix_abs16_y2_c;
4283     c->pix_abs[0][3] = pix_abs16_xy2_c;
4284     c->pix_abs[1][0] = pix_abs8_c;
4285     c->pix_abs[1][1] = pix_abs8_x2_c;
4286     c->pix_abs[1][2] = pix_abs8_y2_c;
4287     c->pix_abs[1][3] = pix_abs8_xy2_c;
4288
4289 #define dspfunc(PFX, IDX, NUM) \
4290     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4291     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4292     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4293     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4294
4295     dspfunc(put, 0, 16);
4296     dspfunc(put_no_rnd, 0, 16);
4297     dspfunc(put, 1, 8);
4298     dspfunc(put_no_rnd, 1, 8);
4299     dspfunc(put, 2, 4);
4300     dspfunc(put, 3, 2);
4301
4302     dspfunc(avg, 0, 16);
4303     dspfunc(avg_no_rnd, 0, 16);
4304     dspfunc(avg, 1, 8);
4305     dspfunc(avg_no_rnd, 1, 8);
4306     dspfunc(avg, 2, 4);
4307     dspfunc(avg, 3, 2);
4308 #undef dspfunc
4309
4310     c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4311     c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4312
4313     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4314     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4315     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4316     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4317     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4318     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4319     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4320     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4321     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4322
4323     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4324     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4325     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4326     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4327     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4328     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4329     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4330     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4331     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4332
4333 #define dspfunc(PFX, IDX, NUM) \
4334     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4335     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4336     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4337     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4338     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4339     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4340     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4341     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4342     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4343     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4344     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4345     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4346     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4347     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4348     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4349     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4350
4351     dspfunc(put_qpel, 0, 16);
4352     dspfunc(put_no_rnd_qpel, 0, 16);
4353
4354     dspfunc(avg_qpel, 0, 16);
4355     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4356
4357     dspfunc(put_qpel, 1, 8);
4358     dspfunc(put_no_rnd_qpel, 1, 8);
4359
4360     dspfunc(avg_qpel, 1, 8);
4361     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4362
4363     dspfunc(put_h264_qpel, 0, 16);
4364     dspfunc(put_h264_qpel, 1, 8);
4365     dspfunc(put_h264_qpel, 2, 4);
4366     dspfunc(put_h264_qpel, 3, 2);
4367     dspfunc(avg_h264_qpel, 0, 16);
4368     dspfunc(avg_h264_qpel, 1, 8);
4369     dspfunc(avg_h264_qpel, 2, 4);
4370
4371 #undef dspfunc
4372     c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4373     c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4374     c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4375     c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4376     c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4377     c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4378     c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4379     c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4380
4381     c->draw_edges = draw_edges_c;
4382
4383 #if CONFIG_CAVS_DECODER
4384     ff_cavsdsp_init(c,avctx);
4385 #endif
4386
4387 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4388     ff_mlp_init(c, avctx);
4389 #endif
4390 #if CONFIG_VC1_DECODER
4391     ff_vc1dsp_init(c,avctx);
4392 #endif
4393 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4394     ff_intrax8dsp_init(c,avctx);
4395 #endif
4396 #if CONFIG_RV30_DECODER
4397     ff_rv30dsp_init(c,avctx);
4398 #endif
4399 #if CONFIG_RV40_DECODER
4400     ff_rv40dsp_init(c,avctx);
4401     c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4402     c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4403     c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4404     c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4405 #endif
4406
4407     c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4408     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4409     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4410     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4411     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4412     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4413     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4414     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4415
4416 #define SET_CMP_FUNC(name) \
4417     c->name[0]= name ## 16_c;\
4418     c->name[1]= name ## 8x8_c;
4419
4420     SET_CMP_FUNC(hadamard8_diff)
4421     c->hadamard8_diff[4]= hadamard8_intra16_c;
4422     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4423     SET_CMP_FUNC(dct_sad)
4424     SET_CMP_FUNC(dct_max)
4425 #if CONFIG_GPL
4426     SET_CMP_FUNC(dct264_sad)
4427 #endif
4428     c->sad[0]= pix_abs16_c;
4429     c->sad[1]= pix_abs8_c;
4430     c->sse[0]= sse16_c;
4431     c->sse[1]= sse8_c;
4432     c->sse[2]= sse4_c;
4433     SET_CMP_FUNC(quant_psnr)
4434     SET_CMP_FUNC(rd)
4435     SET_CMP_FUNC(bit)
4436     c->vsad[0]= vsad16_c;
4437     c->vsad[4]= vsad_intra16_c;
4438     c->vsad[5]= vsad_intra8_c;
4439     c->vsse[0]= vsse16_c;
4440     c->vsse[4]= vsse_intra16_c;
4441     c->vsse[5]= vsse_intra8_c;
4442     c->nsse[0]= nsse16_c;
4443     c->nsse[1]= nsse8_c;
4444 #if CONFIG_DWT
4445     ff_dsputil_init_dwt(c);
4446 #endif
4447
4448     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4449
4450     c->add_bytes= add_bytes_c;
4451     c->add_bytes_l2= add_bytes_l2_c;
4452     c->diff_bytes= diff_bytes_c;
4453     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4454     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4455     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4456     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4457     c->bswap_buf= bswap_buf;
4458 #if CONFIG_PNG_DECODER
4459     c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4460 #endif
4461
4462     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4463         c->h263_h_loop_filter= h263_h_loop_filter_c;
4464         c->h263_v_loop_filter= h263_v_loop_filter_c;
4465     }
4466
4467     if (CONFIG_VP3_DECODER) {
4468         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4469         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4470         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4471     }
4472     if (CONFIG_VP6_DECODER) {
4473         c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4474     }
4475
4476     c->h261_loop_filter= h261_loop_filter_c;
4477
4478     c->try_8x8basis= try_8x8basis_c;
4479     c->add_8x8basis= add_8x8basis_c;
4480
4481 #if CONFIG_VORBIS_DECODER
4482     c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4483 #endif
4484 #if CONFIG_AC3_DECODER
4485     c->ac3_downmix = ff_ac3_downmix_c;
4486 #endif
4487 #if CONFIG_LPC
4488     c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4489 #endif
4490     c->vector_fmul = vector_fmul_c;
4491     c->vector_fmul_reverse = vector_fmul_reverse_c;
4492     c->vector_fmul_add = vector_fmul_add_c;
4493     c->vector_fmul_window = ff_vector_fmul_window_c;
4494     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4495     c->vector_clipf = vector_clipf_c;
4496     c->float_to_int16 = ff_float_to_int16_c;
4497     c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4498     c->scalarproduct_int16 = scalarproduct_int16_c;
4499     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4500     c->scalarproduct_float = scalarproduct_float_c;
4501     c->butterflies_float = butterflies_float_c;
4502     c->vector_fmul_scalar = vector_fmul_scalar_c;
4503
4504     c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4505     c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4506
4507     c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4508     c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4509
4510     c->shrink[0]= ff_img_copy_plane;
4511     c->shrink[1]= ff_shrink22;
4512     c->shrink[2]= ff_shrink44;
4513     c->shrink[3]= ff_shrink88;
4514
4515     c->prefetch= just_return;
4516
4517     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4518     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4519
4520     if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4521     if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4522     if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4523     if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4524     if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4525     if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4526     if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4527     if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4528     if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
4529
4530     for(i=0; i<64; i++){
4531         if(!c->put_2tap_qpel_pixels_tab[0][i])
4532             c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4533         if(!c->avg_2tap_qpel_pixels_tab[0][i])
4534             c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4535     }
4536
4537     switch(c->idct_permutation_type){
4538     case FF_NO_IDCT_PERM:
4539         for(i=0; i<64; i++)
4540             c->idct_permutation[i]= i;
4541         break;
4542     case FF_LIBMPEG2_IDCT_PERM:
4543         for(i=0; i<64; i++)
4544             c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4545         break;
4546     case FF_SIMPLE_IDCT_PERM:
4547         for(i=0; i<64; i++)
4548             c->idct_permutation[i]= simple_mmx_permutation[i];
4549         break;
4550     case FF_TRANSPOSE_IDCT_PERM:
4551         for(i=0; i<64; i++)
4552             c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4553         break;
4554     case FF_PARTTRANS_IDCT_PERM:
4555         for(i=0; i<64; i++)
4556             c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4557         break;
4558     case FF_SSE2_IDCT_PERM:
4559         for(i=0; i<64; i++)
4560             c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4561         break;
4562     default:
4563         av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
4564     }
4565 }
4566