/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * @file dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
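/* Why this works: ~0UL/255 evaluates to 0x01010101UL on a 32-bit target and
 * 0x0101010101010101UL on a 64-bit one, so multiplying it by a byte value
 * replicates that byte across every byte lane of the native word. */
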
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};

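#if 0
/* Illustrative self-check of the property stated above (a hypothetical test
 * helper, not part of this file's build): for all 0<=a<=65536 and 2<=b<=255
 * the multiply-and-shift must reproduce integer division exactly. */
static void check_ff_inverse(void)
{
    unsigned a, b;
    for (b = 2; b <= 255; b++)
        for (a = 0; a <= 65536; a++)
            assert((uint32_t)(((uint64_t)a * ff_inverse[b]) >> 32) == a / b);
}
#endif
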
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#ifdef ARCH_POWERPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
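
/* Typical caller pattern (illustrative, per the MPEG video code): a scan
 * order is combined with the IDCT's input permutation chosen at init time,
 * e.g.
 *     ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
 * afterwards st->permutated[i] maps scan position i directly to the permuted
 * coefficient index, and st->raster_end[i] gives the highest permuted index
 * reached up to scan position i. */
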
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
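
/* Note the "+ 256" bias: ff_squareTbl is filled at init so that
 * (ff_squareTbl + 256)[d] == d*d for -255 <= d <= 255. pix_norm1_c only uses
 * the non-negative half; the sse*_c functions below rely on the bias so a
 * signed pixel difference can be squared with a single table lookup. */
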
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
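
/* The w53/w97 functions are wavelet-domain distortion metrics: the pixel
 * difference block is transformed with ff_spatial_dwt (5/3 or 9/7 kernel)
 * and the subband coefficients are summed with the per-subband weights
 * above, which presumably tracks the snow encoder's own transform when
 * these are selected as comparison functions. */
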
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
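
/* Sketch of the intended use (field names as in the MPEG video decoder; the
 * 17x17 size is just an example for a 16x16 block read with one extra row and
 * column of interpolation context): when a motion vector points partly
 * outside the reference frame, copy the needed area into a scratch buffer
 * with replicated edges and read from that instead:
 *
 *     if(src_x<0 || src_y<0 || src_x + 17 > s->h_edge_pos
 *                           || src_y + 17 > s->v_edge_pos){
 *         ff_emulated_edge_mc(s->edge_emu_buffer, src, s->linesize, 17, 17,
 *                             src_x, src_y, s->h_edge_pos, s->v_edge_pos);
 *         src = s->edge_emu_buffer;
 *     }
 */
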
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= AV_RN32(pixels  );\
    const uint32_t b= AV_RN32(pixels+1);\
    uint32_t l0=  (a&0x03030303UL)\
                + (b&0x03030303UL)\
                + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= AV_RN32(pixels  );\
        uint32_t b= AV_RN32(pixels+1);\
        l1=  (a&0x03030303UL)\
           + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN32(pixels  );\
        b= AV_RN32(pixels+1);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c        , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
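
/* The rnd_avg32/no_rnd_avg32 helpers used by PIXOP2 above are the classic
 * SWAR byte-parallel averages, processing four bytes per 32-bit word with no
 * carries between lanes:
 *     (a|b) - (((a^b)&0xFEFEFEFE)>>1)  ==  per-byte (a+b+1)>>1   (rounding)
 *     (a&b) + (((a^b)&0xFEFEFEFE)>>1)  ==  per-byte (a+b)>>1     (truncating)
 * since a+b == (a^b) + 2*(a&b) == 2*(a|b) - (a^b); the 0xFE mask clears each
 * lane's low bit before the shift so nothing leaks into the neighbouring
 * byte. avg2/avg4 above are the plain scalar equivalents used further down. */
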
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
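
/* gmc1_c is plain bilinear interpolation at a 1/16-pel offset (x16,y16):
 * the weights satisfy A+B+C+D == (16-x16+x16)*(16-y16+y16) == 256, so the
 * >>8 renormalizes exactly and "rounder" supplies the rounding constant
 * (e.g. 128 for round-to-nearest). */
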
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
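
/* ff_gmc_c implements affine global motion compensation: (vx,vy) is the
 * source coordinate on a 1/(1<<shift)-pel grid in 16.16 fixed point; it
 * advances by (dxx,dyx) per output column while the row origin (ox,oy)
 * advances by (dxy,dyy) per output line. Each sample is bilinearly
 * interpolated from its four neighbours, and positions outside the picture
 * are clamped to the nearest edge sample. */
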
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
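
/* The tpel (third-pel, SVQ3-style) filters approximate division by 3 and by
 * 12 with fixed-point multiplies: 683 == round(2^11/3), so
 * (683*(2*a + b + 1)) >> 11 is roughly (2*a + b)/3, and 2731 == round(2^15/12)
 * plays the same role for the two-dimensional cases below. */
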
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
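
/* H.264 chroma MC is 1/8-pel bilinear: A+B+C+D == (8-x+x)*(8-y+y) == 64, so
 * op_put's ((b)+32)>>6 renormalizes with round-to-nearest and op_avg then
 * averages the result with the existing destination pixel. When x or y is 0,
 * D == 0 and the 2-D filter degenerates to the cheaper 1-D path with E = B+C
 * along the appropriate axis. */
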
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

1705 #define QPEL_MC(r, OPNAME, RND, OP) \
1706 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1707 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1708 int i;\
1709 for(i=0; i<h; i++)\
1711 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1712 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1713 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1714 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1715 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1716 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1717 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1718 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1719 dst+=dstStride;\
1720 src+=srcStride;\
1724 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1725 const int w=8;\
1726 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1727 int i;\
1728 for(i=0; i<w; i++)\
1730 const int src0= src[0*srcStride];\
1731 const int src1= src[1*srcStride];\
1732 const int src2= src[2*srcStride];\
1733 const int src3= src[3*srcStride];\
1734 const int src4= src[4*srcStride];\
1735 const int src5= src[5*srcStride];\
1736 const int src6= src[6*srcStride];\
1737 const int src7= src[7*srcStride];\
1738 const int src8= src[8*srcStride];\
1739 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1740 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1741 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1742 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1743 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1744 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1745 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1746 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1747 dst++;\
1748 src++;\
1752 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1753 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1754 int i;\
1756 for(i=0; i<h; i++)\
1758 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1759 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1760 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1761 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1762 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1763 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1764 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1765 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1766 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1767 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1768 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1769 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1770 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1771 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1772 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1773 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1774 dst+=dstStride;\
1775 src+=srcStride;\
1776 }\
1777 }\
1778 \
1779 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1780 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1781 int i;\
1782 const int w=16;\
1783 for(i=0; i<w; i++)\
1784 {\
1785 const int src0= src[0*srcStride];\
1786 const int src1= src[1*srcStride];\
1787 const int src2= src[2*srcStride];\
1788 const int src3= src[3*srcStride];\
1789 const int src4= src[4*srcStride];\
1790 const int src5= src[5*srcStride];\
1791 const int src6= src[6*srcStride];\
1792 const int src7= src[7*srcStride];\
1793 const int src8= src[8*srcStride];\
1794 const int src9= src[9*srcStride];\
1795 const int src10= src[10*srcStride];\
1796 const int src11= src[11*srcStride];\
1797 const int src12= src[12*srcStride];\
1798 const int src13= src[13*srcStride];\
1799 const int src14= src[14*srcStride];\
1800 const int src15= src[15*srcStride];\
1801 const int src16= src[16*srcStride];\
1802 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1803 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1804 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1805 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1806 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1807 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1808 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1809 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1810 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1811 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1812 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1813 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1814 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1815 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1816 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1817 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1818 dst++;\
1819 src++;\
1820 }\
1821 }\
1822 \
1823 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1824 OPNAME ## pixels8_c(dst, src, stride, 8);\
1825 }\
1826 \
1827 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1828 uint8_t half[64];\
1829 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1830 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1831 }\
1832 \
1833 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1834 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1835 }\
1836 \
1837 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1838 uint8_t half[64];\
1839 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1840 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1841 }\
1842 \
1843 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1845 uint8_t half[64];\
1846 copy_block9(full, src, 16, stride, 9);\
1847 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1848 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1849 }\
1850 \
1851 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t full[16*9];\
1853 copy_block9(full, src, 16, stride, 9);\
1854 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1855 }\
1856 \
1857 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1858 uint8_t full[16*9];\
1859 uint8_t half[64];\
1860 copy_block9(full, src, 16, stride, 9);\
1861 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1862 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1863 }\
1864 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1865 uint8_t full[16*9];\
1866 uint8_t halfH[72];\
1867 uint8_t halfV[64];\
1868 uint8_t halfHV[64];\
1869 copy_block9(full, src, 16, stride, 9);\
1870 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1871 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1874 }\
1875 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1876 uint8_t full[16*9];\
1877 uint8_t halfH[72];\
1878 uint8_t halfHV[64];\
1879 copy_block9(full, src, 16, stride, 9);\
1880 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1882 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1884 }\
1885 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t full[16*9];\
1887 uint8_t halfH[72];\
1888 uint8_t halfV[64];\
1889 uint8_t halfHV[64];\
1890 copy_block9(full, src, 16, stride, 9);\
1891 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1892 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1894 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1895 }\
1896 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1897 uint8_t full[16*9];\
1898 uint8_t halfH[72];\
1899 uint8_t halfHV[64];\
1900 copy_block9(full, src, 16, stride, 9);\
1901 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1902 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1903 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1904 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1905 }\
1906 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1907 uint8_t full[16*9];\
1908 uint8_t halfH[72];\
1909 uint8_t halfV[64];\
1910 uint8_t halfHV[64];\
1911 copy_block9(full, src, 16, stride, 9);\
1912 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1913 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1914 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1915 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1916 }\
1917 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1918 uint8_t full[16*9];\
1919 uint8_t halfH[72];\
1920 uint8_t halfHV[64];\
1921 copy_block9(full, src, 16, stride, 9);\
1922 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1923 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1924 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1925 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1926 }\
1927 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[16*9];\
1929 uint8_t halfH[72];\
1930 uint8_t halfV[64];\
1931 uint8_t halfHV[64];\
1932 copy_block9(full, src, 16, stride, 9);\
1933 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1934 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1936 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1937 }\
1938 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1939 uint8_t full[16*9];\
1940 uint8_t halfH[72];\
1941 uint8_t halfHV[64];\
1942 copy_block9(full, src, 16, stride, 9);\
1943 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1944 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1945 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1946 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1947 }\
1948 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1949 uint8_t halfH[72];\
1950 uint8_t halfHV[64];\
1951 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1952 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1953 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1954 }\
1955 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t halfH[72];\
1957 uint8_t halfHV[64];\
1958 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1959 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1960 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1961 }\
1962 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1963 uint8_t full[16*9];\
1964 uint8_t halfH[72];\
1965 uint8_t halfV[64];\
1966 uint8_t halfHV[64];\
1967 copy_block9(full, src, 16, stride, 9);\
1968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1969 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1971 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1972 }\
1973 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1974 uint8_t full[16*9];\
1975 uint8_t halfH[72];\
1976 copy_block9(full, src, 16, stride, 9);\
1977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1978 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1979 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1980 }\
1981 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1982 uint8_t full[16*9];\
1983 uint8_t halfH[72];\
1984 uint8_t halfV[64];\
1985 uint8_t halfHV[64];\
1986 copy_block9(full, src, 16, stride, 9);\
1987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1988 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1989 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1990 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1991 }\
1992 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1993 uint8_t full[16*9];\
1994 uint8_t halfH[72];\
1995 copy_block9(full, src, 16, stride, 9);\
1996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1997 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1998 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1999 }\
2000 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2001 uint8_t halfH[72];\
2002 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2003 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2004 }\
2005 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2006 OPNAME ## pixels16_c(dst, src, stride, 16);\
2007 }\
2008 \
2009 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2010 uint8_t half[256];\
2011 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2012 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2013 }\
2014 \
2015 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2016 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2017 }\
2018 \
2019 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2020 uint8_t half[256];\
2021 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2022 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2023 }\
2024 \
2025 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2027 uint8_t half[256];\
2028 copy_block17(full, src, 24, stride, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2030 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2031 }\
2032 \
2033 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 copy_block17(full, src, 24, stride, 17);\
2036 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2037 }\
2038 \
2039 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2040 uint8_t full[24*17];\
2041 uint8_t half[256];\
2042 copy_block17(full, src, 24, stride, 17);\
2043 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2044 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2045 }\
2046 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t full[24*17];\
2048 uint8_t halfH[272];\
2049 uint8_t halfV[256];\
2050 uint8_t halfHV[256];\
2051 copy_block17(full, src, 24, stride, 17);\
2052 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2053 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2056 }\
2057 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058 uint8_t full[24*17];\
2059 uint8_t halfH[272];\
2060 uint8_t halfHV[256];\
2061 copy_block17(full, src, 24, stride, 17);\
2062 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2063 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2064 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2065 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2066 }\
2067 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2068 uint8_t full[24*17];\
2069 uint8_t halfH[272];\
2070 uint8_t halfV[256];\
2071 uint8_t halfHV[256];\
2072 copy_block17(full, src, 24, stride, 17);\
2073 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2074 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2076 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2077 }\
2078 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2079 uint8_t full[24*17];\
2080 uint8_t halfH[272];\
2081 uint8_t halfHV[256];\
2082 copy_block17(full, src, 24, stride, 17);\
2083 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2084 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2085 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2086 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2087 }\
2088 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2089 uint8_t full[24*17];\
2090 uint8_t halfH[272];\
2091 uint8_t halfV[256];\
2092 uint8_t halfHV[256];\
2093 copy_block17(full, src, 24, stride, 17);\
2094 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2095 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2096 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2097 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2098 }\
2099 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2100 uint8_t full[24*17];\
2101 uint8_t halfH[272];\
2102 uint8_t halfHV[256];\
2103 copy_block17(full, src, 24, stride, 17);\
2104 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2105 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2106 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2107 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2108 }\
2109 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2110 uint8_t full[24*17];\
2111 uint8_t halfH[272];\
2112 uint8_t halfV[256];\
2113 uint8_t halfHV[256];\
2114 copy_block17(full, src, 24, stride, 17);\
2115 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2116 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2118 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2119 }\
2120 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2121 uint8_t full[24*17];\
2122 uint8_t halfH[272];\
2123 uint8_t halfHV[256];\
2124 copy_block17(full, src, 24, stride, 17);\
2125 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2126 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2127 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2128 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2129 }\
2130 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2131 uint8_t halfH[272];\
2132 uint8_t halfHV[256];\
2133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2134 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2135 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2136 }\
2137 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2138 uint8_t halfH[272];\
2139 uint8_t halfHV[256];\
2140 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2141 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2142 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2143 }\
2144 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2145 uint8_t full[24*17];\
2146 uint8_t halfH[272];\
2147 uint8_t halfV[256];\
2148 uint8_t halfHV[256];\
2149 copy_block17(full, src, 24, stride, 17);\
2150 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2151 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2153 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2154 }\
2155 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2156 uint8_t full[24*17];\
2157 uint8_t halfH[272];\
2158 copy_block17(full, src, 24, stride, 17);\
2159 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2160 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2161 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2162 }\
2163 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2164 uint8_t full[24*17];\
2165 uint8_t halfH[272];\
2166 uint8_t halfV[256];\
2167 uint8_t halfHV[256];\
2168 copy_block17(full, src, 24, stride, 17);\
2169 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2170 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2171 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2172 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2173 }\
2174 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2175 uint8_t full[24*17];\
2176 uint8_t halfH[272];\
2177 copy_block17(full, src, 24, stride, 17);\
2178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2179 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2180 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2181 }\
2182 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2183 uint8_t halfH[272];\
2184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2185 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2186 }
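/* [Editor's note, not original] Everything above expands, per OPNAME/RND,
 * into the full set of MPEG-4 quarter-pel motion-compensation routines. The
 * lowpass kernels are the 8-tap half-pel filter with taps
 * (-1, 3, -6, 20, 20, -6, 3, -1), mirrored at the block edges; the taps sum
 * to 32, which the OP macros below divide out again. In the mcXY names, X is
 * the horizontal and Y the vertical quarter-pel offset, and the odd quarter
 * positions are built by averaging a full-pel or half-pel plane with a
 * filtered one (the pixels*_l2/_l4 calls). A sketch of the per-pixel
 * rounding average that _l2 applies, under those assumptions:
 */
static inline uint8_t qpel_avg2_sketch(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1); /* the no_rnd variants drop the +1 */
}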
2188 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2189 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2190 #define op_put(a, b) a = cm[((b) + 16)>>5]
2191 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2193 QPEL_MC(0, put_ , _ , op_put)
2194 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2195 QPEL_MC(0, avg_ , _ , op_avg)
2196 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2197 #undef op_avg
2198 #undef op_avg_no_rnd
2199 #undef op_put
2200 #undef op_put_no_rnd
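/* [Editor's note, not original] op_put rounds to nearest, ((b)+16)>>5, while
 * op_put_no_rnd uses ((b)+15)>>5; the cm[] table then clips the result into
 * 0..255. Worked example: a filter sum b = 3184 (99.5 * 32) gives
 * (3184+16)>>5 = 100 with rounding but (3184+15)>>5 = 99 without, which is
 * the difference MPEG-4's rounding control selects between. */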
2202 #if 1
2203 #define H264_LOWPASS(OPNAME, OP, OP2) \
2204 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205 const int h=2;\
2206 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207 int i;\
2208 for(i=0; i<h; i++)\
2209 {\
2210 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2211 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2212 dst+=dstStride;\
2213 src+=srcStride;\
2214 }\
2215 }\
2216 \
2217 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2218 const int w=2;\
2219 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2220 int i;\
2221 for(i=0; i<w; i++)\
2222 {\
2223 const int srcB= src[-2*srcStride];\
2224 const int srcA= src[-1*srcStride];\
2225 const int src0= src[0 *srcStride];\
2226 const int src1= src[1 *srcStride];\
2227 const int src2= src[2 *srcStride];\
2228 const int src3= src[3 *srcStride];\
2229 const int src4= src[4 *srcStride];\
2230 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2231 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2232 dst++;\
2233 src++;\
2234 }\
2235 }\
2236 \
2237 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2238 const int h=2;\
2239 const int w=2;\
2240 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2241 int i;\
2242 src -= 2*srcStride;\
2243 for(i=0; i<h+5; i++)\
2244 {\
2245 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2246 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2247 tmp+=tmpStride;\
2248 src+=srcStride;\
2249 }\
2250 tmp -= tmpStride*(h+5-2);\
2251 for(i=0; i<w; i++)\
2252 {\
2253 const int tmpB= tmp[-2*tmpStride];\
2254 const int tmpA= tmp[-1*tmpStride];\
2255 const int tmp0= tmp[0 *tmpStride];\
2256 const int tmp1= tmp[1 *tmpStride];\
2257 const int tmp2= tmp[2 *tmpStride];\
2258 const int tmp3= tmp[3 *tmpStride];\
2259 const int tmp4= tmp[4 *tmpStride];\
2260 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2261 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2262 dst++;\
2263 tmp++;\
2264 }\
2265 }\
2266 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2267 const int h=4;\
2268 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2269 int i;\
2270 for(i=0; i<h; i++)\
2271 {\
2272 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2273 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2274 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2275 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2276 dst+=dstStride;\
2277 src+=srcStride;\
2278 }\
2279 }\
2280 \
2281 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282 const int w=4;\
2283 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284 int i;\
2285 for(i=0; i<w; i++)\
2286 {\
2287 const int srcB= src[-2*srcStride];\
2288 const int srcA= src[-1*srcStride];\
2289 const int src0= src[0 *srcStride];\
2290 const int src1= src[1 *srcStride];\
2291 const int src2= src[2 *srcStride];\
2292 const int src3= src[3 *srcStride];\
2293 const int src4= src[4 *srcStride];\
2294 const int src5= src[5 *srcStride];\
2295 const int src6= src[6 *srcStride];\
2296 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2297 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2298 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2299 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2300 dst++;\
2301 src++;\
2302 }\
2303 }\
2304 \
2305 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2306 const int h=4;\
2307 const int w=4;\
2308 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2309 int i;\
2310 src -= 2*srcStride;\
2311 for(i=0; i<h+5; i++)\
2312 {\
2313 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2314 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2315 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2316 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2317 tmp+=tmpStride;\
2318 src+=srcStride;\
2319 }\
2320 tmp -= tmpStride*(h+5-2);\
2321 for(i=0; i<w; i++)\
2322 {\
2323 const int tmpB= tmp[-2*tmpStride];\
2324 const int tmpA= tmp[-1*tmpStride];\
2325 const int tmp0= tmp[0 *tmpStride];\
2326 const int tmp1= tmp[1 *tmpStride];\
2327 const int tmp2= tmp[2 *tmpStride];\
2328 const int tmp3= tmp[3 *tmpStride];\
2329 const int tmp4= tmp[4 *tmpStride];\
2330 const int tmp5= tmp[5 *tmpStride];\
2331 const int tmp6= tmp[6 *tmpStride];\
2332 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2333 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2334 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2335 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2336 dst++;\
2337 tmp++;\
2338 }\
2339 }\
2340 \
2341 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2342 const int h=8;\
2343 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2344 int i;\
2345 for(i=0; i<h; i++)\
2346 {\
2347 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2348 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2349 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2350 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2351 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2352 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2353 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2354 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2355 dst+=dstStride;\
2356 src+=srcStride;\
2357 }\
2358 }\
2359 \
2360 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2361 const int w=8;\
2362 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2363 int i;\
2364 for(i=0; i<w; i++)\
2365 {\
2366 const int srcB= src[-2*srcStride];\
2367 const int srcA= src[-1*srcStride];\
2368 const int src0= src[0 *srcStride];\
2369 const int src1= src[1 *srcStride];\
2370 const int src2= src[2 *srcStride];\
2371 const int src3= src[3 *srcStride];\
2372 const int src4= src[4 *srcStride];\
2373 const int src5= src[5 *srcStride];\
2374 const int src6= src[6 *srcStride];\
2375 const int src7= src[7 *srcStride];\
2376 const int src8= src[8 *srcStride];\
2377 const int src9= src[9 *srcStride];\
2378 const int src10=src[10*srcStride];\
2379 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2380 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2381 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2382 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2383 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2384 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2385 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2386 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2387 dst++;\
2388 src++;\
2389 }\
2390 }\
2391 \
2392 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2393 const int h=8;\
2394 const int w=8;\
2395 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2396 int i;\
2397 src -= 2*srcStride;\
2398 for(i=0; i<h+5; i++)\
2399 {\
2400 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2401 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2402 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2403 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2404 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2405 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2406 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2407 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2408 tmp+=tmpStride;\
2409 src+=srcStride;\
2410 }\
2411 tmp -= tmpStride*(h+5-2);\
2412 for(i=0; i<w; i++)\
2413 {\
2414 const int tmpB= tmp[-2*tmpStride];\
2415 const int tmpA= tmp[-1*tmpStride];\
2416 const int tmp0= tmp[0 *tmpStride];\
2417 const int tmp1= tmp[1 *tmpStride];\
2418 const int tmp2= tmp[2 *tmpStride];\
2419 const int tmp3= tmp[3 *tmpStride];\
2420 const int tmp4= tmp[4 *tmpStride];\
2421 const int tmp5= tmp[5 *tmpStride];\
2422 const int tmp6= tmp[6 *tmpStride];\
2423 const int tmp7= tmp[7 *tmpStride];\
2424 const int tmp8= tmp[8 *tmpStride];\
2425 const int tmp9= tmp[9 *tmpStride];\
2426 const int tmp10=tmp[10*tmpStride];\
2427 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2428 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2429 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2430 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2431 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2432 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2433 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2434 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2435 dst++;\
2436 tmp++;\
2437 }\
2438 }\
2439 \
2440 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2441 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2442 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2443 src += 8*srcStride;\
2444 dst += 8*dstStride;\
2445 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2446 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2447 }\
2448 \
2449 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2450 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2451 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2452 src += 8*srcStride;\
2453 dst += 8*dstStride;\
2454 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2455 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2456 }\
2457 \
2458 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2459 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2461 src += 8*srcStride;\
2462 dst += 8*dstStride;\
2463 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2464 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2465 }\
2466 \
2467 #define H264_MC(OPNAME, SIZE) \
2468 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2469 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2470 }\
2471 \
2472 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2473 uint8_t half[SIZE*SIZE];\
2474 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2475 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2476 }\
2477 \
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2479 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2480 }\
2481 \
2482 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2483 uint8_t half[SIZE*SIZE];\
2484 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2485 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2486 }\
2487 \
2488 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2489 uint8_t full[SIZE*(SIZE+5)];\
2490 uint8_t * const full_mid= full + SIZE*2;\
2491 uint8_t half[SIZE*SIZE];\
2492 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2493 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2494 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2495 }\
2496 \
2497 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2498 uint8_t full[SIZE*(SIZE+5)];\
2499 uint8_t * const full_mid= full + SIZE*2;\
2500 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2501 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2502 }\
2503 \
2504 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2505 uint8_t full[SIZE*(SIZE+5)];\
2506 uint8_t * const full_mid= full + SIZE*2;\
2507 uint8_t half[SIZE*SIZE];\
2508 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2509 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2510 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2511 }\
2512 \
2513 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2514 uint8_t full[SIZE*(SIZE+5)];\
2515 uint8_t * const full_mid= full + SIZE*2;\
2516 uint8_t halfH[SIZE*SIZE];\
2517 uint8_t halfV[SIZE*SIZE];\
2518 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2519 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2520 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2521 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2522 }\
2523 \
2524 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2525 uint8_t full[SIZE*(SIZE+5)];\
2526 uint8_t * const full_mid= full + SIZE*2;\
2527 uint8_t halfH[SIZE*SIZE];\
2528 uint8_t halfV[SIZE*SIZE];\
2529 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2530 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2531 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2532 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2533 }\
2534 \
2535 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2536 uint8_t full[SIZE*(SIZE+5)];\
2537 uint8_t * const full_mid= full + SIZE*2;\
2538 uint8_t halfH[SIZE*SIZE];\
2539 uint8_t halfV[SIZE*SIZE];\
2540 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2541 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2542 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2543 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2544 }\
2545 \
2546 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2547 uint8_t full[SIZE*(SIZE+5)];\
2548 uint8_t * const full_mid= full + SIZE*2;\
2549 uint8_t halfH[SIZE*SIZE];\
2550 uint8_t halfV[SIZE*SIZE];\
2551 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2552 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2553 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2554 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2555 }\
2556 \
2557 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2558 int16_t tmp[SIZE*(SIZE+5)];\
2559 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2560 }\
2561 \
2562 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2563 int16_t tmp[SIZE*(SIZE+5)];\
2564 uint8_t halfH[SIZE*SIZE];\
2565 uint8_t halfHV[SIZE*SIZE];\
2566 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2567 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2568 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2569 }\
2570 \
2571 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2572 int16_t tmp[SIZE*(SIZE+5)];\
2573 uint8_t halfH[SIZE*SIZE];\
2574 uint8_t halfHV[SIZE*SIZE];\
2575 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2576 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2577 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2578 }\
2579 \
2580 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2581 uint8_t full[SIZE*(SIZE+5)];\
2582 uint8_t * const full_mid= full + SIZE*2;\
2583 int16_t tmp[SIZE*(SIZE+5)];\
2584 uint8_t halfV[SIZE*SIZE];\
2585 uint8_t halfHV[SIZE*SIZE];\
2586 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2587 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2588 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2589 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2590 }\
2591 \
2592 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2593 uint8_t full[SIZE*(SIZE+5)];\
2594 uint8_t * const full_mid= full + SIZE*2;\
2595 int16_t tmp[SIZE*(SIZE+5)];\
2596 uint8_t halfV[SIZE*SIZE];\
2597 uint8_t halfHV[SIZE*SIZE];\
2598 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2599 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2600 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2601 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2602 }
2604 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2605 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2606 #define op_put(a, b) a = cm[((b) + 16)>>5]
2607 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2608 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2610 H264_LOWPASS(put_ , op_put, op2_put)
2611 H264_LOWPASS(avg_ , op_avg, op2_avg)
2612 H264_MC(put_, 2)
2613 H264_MC(put_, 4)
2614 H264_MC(put_, 8)
2615 H264_MC(put_, 16)
2616 H264_MC(avg_, 4)
2617 H264_MC(avg_, 8)
2618 H264_MC(avg_, 16)
2620 #undef op_avg
2621 #undef op_put
2622 #undef op2_avg
2623 #undef op2_put
2624 #endif
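/* [Editor's note, not original] The H264_LOWPASS kernels above implement the
 * H.264 6-tap half-pel filter (1,-5,20,20,-5,1)/32. In the diagonal (hv)
 * paths the horizontal stage is kept unnormalized in the int16_t tmp[] plane
 * and both divisions happen at the end, which is why the op2_* macros shift
 * by 10 (32*32 = 1024) instead of 5. A sketch of one interior tap
 * evaluation, assuming two valid samples on each side:
 */
static inline int h264_sixtap_sketch(int m2, int m1, int p0, int p1, int p2, int p3)
{
    /* caller adds 16 (or 512 on the two-stage path) before shifting */
    return (p0 + p1)*20 - (m1 + p2)*5 + (m2 + p3);
}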
2626 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2627 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2628 #define H264_WEIGHT(W,H) \
2629 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2630 int y; \
2631 offset <<= log2_denom; \
2632 if(log2_denom) offset += 1<<(log2_denom-1); \
2633 for(y=0; y<H; y++, block += stride){ \
2634 op_scale1(0); \
2635 op_scale1(1); \
2636 if(W==2) continue; \
2637 op_scale1(2); \
2638 op_scale1(3); \
2639 if(W==4) continue; \
2640 op_scale1(4); \
2641 op_scale1(5); \
2642 op_scale1(6); \
2643 op_scale1(7); \
2644 if(W==8) continue; \
2645 op_scale1(8); \
2646 op_scale1(9); \
2647 op_scale1(10); \
2648 op_scale1(11); \
2649 op_scale1(12); \
2650 op_scale1(13); \
2651 op_scale1(14); \
2652 op_scale1(15); \
2653 } \
2654 } \
2655 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2656 int y; \
2657 offset = ((offset + 1) | 1) << log2_denom; \
2658 for(y=0; y<H; y++, dst += stride, src += stride){ \
2659 op_scale2(0); \
2660 op_scale2(1); \
2661 if(W==2) continue; \
2662 op_scale2(2); \
2663 op_scale2(3); \
2664 if(W==4) continue; \
2665 op_scale2(4); \
2666 op_scale2(5); \
2667 op_scale2(6); \
2668 op_scale2(7); \
2669 if(W==8) continue; \
2670 op_scale2(8); \
2671 op_scale2(9); \
2672 op_scale2(10); \
2673 op_scale2(11); \
2674 op_scale2(12); \
2675 op_scale2(13); \
2676 op_scale2(14); \
2677 op_scale2(15); \
2678 } \
2679 }
2681 H264_WEIGHT(16,16)
2682 H264_WEIGHT(16,8)
2683 H264_WEIGHT(8,16)
2684 H264_WEIGHT(8,8)
2685 H264_WEIGHT(8,4)
2686 H264_WEIGHT(4,8)
2687 H264_WEIGHT(4,4)
2688 H264_WEIGHT(4,2)
2689 H264_WEIGHT(2,4)
2690 H264_WEIGHT(2,2)
2692 #undef op_scale1
2693 #undef op_scale2
2694 #undef H264_WEIGHT
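/* [Editor's note, not original] weight_* implements H.264 explicit weighted
 * prediction on one reference and biweight_* the two-reference blend; the
 * ((offset + 1) | 1) trick folds the spec's rounding term and the averaged
 * offsets into a single odd constant. A scalar sketch of what op_scale2
 * computes per pixel, under those assumptions:
 */
static inline uint8_t h264_biweight_sample_sketch(uint8_t dst, uint8_t src,
                                                  int log2_denom, int weightd,
                                                  int weights, int offset)
{
    const int o = ((offset + 1) | 1) << log2_denom;
    const int v = (src*weights + dst*weightd + o) >> (log2_denom + 1);
    return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); /* av_clip_uint8() */
}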
2696 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2697 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2698 int i;
2700 for(i=0; i<h; i++){
2701 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2702 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2703 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2704 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2705 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2706 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2707 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2708 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2709 dst+=dstStride;
2710 src+=srcStride;
2711 }
2712 }
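/* [Editor's note, not original] wmv2_mspel8_h_lowpass is the WMV2 half-pel
 * filter with taps (-1,9,9,-1)/16; the +8 rounds to nearest before >>4 and
 * cm[] clips. Worked example: samples 90,100,110,120 around the half-pel
 * position give (9*(100+110) - (90+120) + 8) >> 4 = 1688 >> 4 = 105. */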
2714 #ifdef CONFIG_CAVS_DECODER
2715 /* AVS specific */
2716 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2718 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2719 put_pixels8_c(dst, src, stride, 8);
2720 }
2721 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2722 avg_pixels8_c(dst, src, stride, 8);
2723 }
2724 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2725 put_pixels16_c(dst, src, stride, 16);
2726 }
2727 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2728 avg_pixels16_c(dst, src, stride, 16);
2729 }
2730 #endif /* CONFIG_CAVS_DECODER */
2732 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2733 /* VC-1 specific */
2734 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2736 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2737 put_pixels8_c(dst, src, stride, 8);
2738 }
2739 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2741 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2743 /* H264 specific */
2744 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2746 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2747 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2748 int i;
2750 for(i=0; i<w; i++){
2751 const int src_1= src[ -srcStride];
2752 const int src0 = src[0 ];
2753 const int src1 = src[ srcStride];
2754 const int src2 = src[2*srcStride];
2755 const int src3 = src[3*srcStride];
2756 const int src4 = src[4*srcStride];
2757 const int src5 = src[5*srcStride];
2758 const int src6 = src[6*srcStride];
2759 const int src7 = src[7*srcStride];
2760 const int src8 = src[8*srcStride];
2761 const int src9 = src[9*srcStride];
2762 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2763 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2764 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2765 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2766 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2767 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2768 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2769 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2770 src++;
2771 dst++;
2772 }
2773 }
2775 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2776 put_pixels8_c(dst, src, stride, 8);
2777 }
2779 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2780 uint8_t half[64];
2781 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2782 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2783 }
2785 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2786 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2787 }
2789 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2790 uint8_t half[64];
2791 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2792 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2793 }
2795 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2796 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2797 }
2799 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2800 uint8_t halfH[88];
2801 uint8_t halfV[64];
2802 uint8_t halfHV[64];
2803 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2804 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2805 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2806 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2807 }
2808 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2809 uint8_t halfH[88];
2810 uint8_t halfV[64];
2811 uint8_t halfHV[64];
2812 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2813 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2814 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2815 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2816 }
2817 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2818 uint8_t halfH[88];
2819 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2820 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2821 }
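/* [Editor's note, not original] In the mc12/mc32/mc22 cases above the
 * horizontal pass filters 11 rows starting at src-stride, so the vertical
 * pass has its off-block context; halfH+8 then addresses the second row of
 * the 8-wide temporary, i.e. the row aligned with the block itself. */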
2823 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2824 if(ENABLE_ANY_H263) {
2825 int x;
2826 const int strength= ff_h263_loop_filter_strength[qscale];
2828 for(x=0; x<8; x++){
2829 int d1, d2, ad1;
2830 int p0= src[x-2*stride];
2831 int p1= src[x-1*stride];
2832 int p2= src[x+0*stride];
2833 int p3= src[x+1*stride];
2834 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2836 if (d<-2*strength) d1= 0;
2837 else if(d<- strength) d1=-2*strength - d;
2838 else if(d< strength) d1= d;
2839 else if(d< 2*strength) d1= 2*strength - d;
2840 else d1= 0;
2842 p1 += d1;
2843 p2 -= d1;
2844 if(p1&256) p1= ~(p1>>31);
2845 if(p2&256) p2= ~(p2>>31);
2847 src[x-1*stride] = p1;
2848 src[x+0*stride] = p2;
2850 ad1= FFABS(d1)>>1;
2852 d2= av_clip((p0-p3)/4, -ad1, ad1);
2854 src[x-2*stride] = p0 - d2;
2855 src[x+ stride] = p3 + d2;
2856 }
2857 }
2858 }
2860 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2861 if(ENABLE_ANY_H263) {
2862 int y;
2863 const int strength= ff_h263_loop_filter_strength[qscale];
2865 for(y=0; y<8; y++){
2866 int d1, d2, ad1;
2867 int p0= src[y*stride-2];
2868 int p1= src[y*stride-1];
2869 int p2= src[y*stride+0];
2870 int p3= src[y*stride+1];
2871 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2873 if (d<-2*strength) d1= 0;
2874 else if(d<- strength) d1=-2*strength - d;
2875 else if(d< strength) d1= d;
2876 else if(d< 2*strength) d1= 2*strength - d;
2877 else d1= 0;
2879 p1 += d1;
2880 p2 -= d1;
2881 if(p1&256) p1= ~(p1>>31);
2882 if(p2&256) p2= ~(p2>>31);
2884 src[y*stride-1] = p1;
2885 src[y*stride+0] = p2;
2887 ad1= FFABS(d1)>>1;
2889 d2= av_clip((p0-p3)/4, -ad1, ad1);
2891 src[y*stride-2] = p0 - d2;
2892 src[y*stride+1] = p3 + d2;
2893 }
2894 }
2895 }
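/* [Editor's note, not original] The delta d1 above is the H.263 (Annex J)
 * dead-zone ramp: identity for small |d|, folded back to zero between
 * strength and 2*strength. An equivalent formulation of the same mapping,
 * as a sketch:
 */
static inline int h263_d1_sketch(int d, int strength)
{
    const int ad   = d < 0 ? -d : d;
    const int sign = d < 0 ? -1 : 1;
    if (ad >= 2*strength) return 0;      /* beyond the ramp: no correction */
    if (ad <= strength)   return d;      /* identity region */
    return sign * (2*strength - ad);     /* folded-back region */
}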
2897 static void h261_loop_filter_c(uint8_t *src, int stride){
2898 int x,y,xy,yz;
2899 int temp[64];
2901 for(x=0; x<8; x++){
2902 temp[x ] = 4*src[x ];
2903 temp[x + 7*8] = 4*src[x + 7*stride];
2904 }
2905 for(y=1; y<7; y++){
2906 for(x=0; x<8; x++){
2907 xy = y * stride + x;
2908 yz = y * 8 + x;
2909 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2910 }
2911 }
2913 for(y=0; y<8; y++){
2914 src[ y*stride] = (temp[ y*8] + 2)>>2;
2915 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2916 for(x=1; x<7; x++){
2917 xy = y * stride + x;
2918 yz = y * 8 + x;
2919 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2920 }
2921 }
2922 }
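/* [Editor's note, not original] h261_loop_filter_c is the H.261 separable
 * [1 2 1] smoother: the temp[] pass filters each column (edge rows copied
 * through as 4*src), the second pass filters each row, and the combined
 * weight of 16 is removed by the final >>4; edge columns only carry the
 * vertical weight of 4, hence their (temp + 2) >> 2. */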
2924 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2925 {
2926 int i, d;
2927 for( i = 0; i < 4; i++ ) {
2928 if( tc0[i] < 0 ) {
2929 pix += 4*ystride;
2930 continue;
2931 }
2932 for( d = 0; d < 4; d++ ) {
2933 const int p0 = pix[-1*xstride];
2934 const int p1 = pix[-2*xstride];
2935 const int p2 = pix[-3*xstride];
2936 const int q0 = pix[0];
2937 const int q1 = pix[1*xstride];
2938 const int q2 = pix[2*xstride];
2940 if( FFABS( p0 - q0 ) < alpha &&
2941 FFABS( p1 - p0 ) < beta &&
2942 FFABS( q1 - q0 ) < beta ) {
2944 int tc = tc0[i];
2945 int i_delta;
2947 if( FFABS( p2 - p0 ) < beta ) {
2948 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2949 tc++;
2950 }
2951 if( FFABS( q2 - q0 ) < beta ) {
2952 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2953 tc++;
2954 }
2956 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2957 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2958 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2959 }
2960 pix += ystride;
2961 }
2962 }
2963 }
2964 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2965 {
2966 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2967 }
2968 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2969 {
2970 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2971 }
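/* [Editor's note, not original] In the luma deblocker above, alpha gates the
 * edge step |p0-q0| and beta the inner activity; tc starts at tc0[i] and
 * grows by one for each side whose p2/q2 also passed the beta test. Worked
 * example with p1,p0,q0,q1 = 60,70,90,95 and tc = 4:
 * i_delta = clip((((90-70)<<2) + (60-95) + 4) >> 3, -4, 4) = clip(6, -4, 4)
 *         = 4, so p0' = 74 and q0' = 86. */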
2973 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2974 {
2975 int i, d;
2976 for( i = 0; i < 4; i++ ) {
2977 const int tc = tc0[i];
2978 if( tc <= 0 ) {
2979 pix += 2*ystride;
2980 continue;
2981 }
2982 for( d = 0; d < 2; d++ ) {
2983 const int p0 = pix[-1*xstride];
2984 const int p1 = pix[-2*xstride];
2985 const int q0 = pix[0];
2986 const int q1 = pix[1*xstride];
2988 if( FFABS( p0 - q0 ) < alpha &&
2989 FFABS( p1 - p0 ) < beta &&
2990 FFABS( q1 - q0 ) < beta ) {
2992 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2994 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2995 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
2996 }
2997 pix += ystride;
2998 }
2999 }
3000 }
3001 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3002 {
3003 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3004 }
3005 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3006 {
3007 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3008 }
3010 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3011 {
3012 int d;
3013 for( d = 0; d < 8; d++ ) {
3014 const int p0 = pix[-1*xstride];
3015 const int p1 = pix[-2*xstride];
3016 const int q0 = pix[0];
3017 const int q1 = pix[1*xstride];
3019 if( FFABS( p0 - q0 ) < alpha &&
3020 FFABS( p1 - p0 ) < beta &&
3021 FFABS( q1 - q0 ) < beta ) {
3023 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3024 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3025 }
3026 pix += ystride;
3027 }
3028 }
3029 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3030 {
3031 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3032 }
3033 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3034 {
3035 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3036 }
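/* [Editor's note, not original] The intra chroma variant above runs over all
 * 8 edge positions unconditionally and, instead of a tc-clipped correction,
 * replaces p0/q0 with the fixed smoothing (2*p1 + p0 + q1 + 2) >> 2, since
 * strong intra edges carry no tc0 bound. */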
3038 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3039 {
3040 int s, i;
3042 s = 0;
3043 for(i=0;i<h;i++) {
3044 s += abs(pix1[0] - pix2[0]);
3045 s += abs(pix1[1] - pix2[1]);
3046 s += abs(pix1[2] - pix2[2]);
3047 s += abs(pix1[3] - pix2[3]);
3048 s += abs(pix1[4] - pix2[4]);
3049 s += abs(pix1[5] - pix2[5]);
3050 s += abs(pix1[6] - pix2[6]);
3051 s += abs(pix1[7] - pix2[7]);
3052 s += abs(pix1[8] - pix2[8]);
3053 s += abs(pix1[9] - pix2[9]);
3054 s += abs(pix1[10] - pix2[10]);
3055 s += abs(pix1[11] - pix2[11]);
3056 s += abs(pix1[12] - pix2[12]);
3057 s += abs(pix1[13] - pix2[13]);
3058 s += abs(pix1[14] - pix2[14]);
3059 s += abs(pix1[15] - pix2[15]);
3060 pix1 += line_size;
3061 pix2 += line_size;
3062 }
3063 return s;
3064 }
3066 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3067 {
3068 int s, i;
3070 s = 0;
3071 for(i=0;i<h;i++) {
3072 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3073 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3074 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3075 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3076 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3077 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3078 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3079 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3080 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3081 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3082 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3083 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3084 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3085 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3086 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3087 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3088 pix1 += line_size;
3089 pix2 += line_size;
3090 }
3091 return s;
3092 }
3094 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3095 {
3096 int s, i;
3097 uint8_t *pix3 = pix2 + line_size;
3099 s = 0;
3100 for(i=0;i<h;i++) {
3101 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3102 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3103 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3104 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3105 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3106 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3107 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3108 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3109 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3110 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3111 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3112 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3113 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3114 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3115 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3116 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3117 pix1 += line_size;
3118 pix2 += line_size;
3119 pix3 += line_size;
3120 }
3121 return s;
3122 }
3124 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3125 {
3126 int s, i;
3127 uint8_t *pix3 = pix2 + line_size;
3129 s = 0;
3130 for(i=0;i<h;i++) {
3131 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3132 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3133 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3134 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3135 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3136 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3137 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3138 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3139 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3140 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3141 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3142 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3143 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3144 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3145 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3146 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3147 pix1 += line_size;
3148 pix2 += line_size;
3149 pix3 += line_size;
3150 }
3151 return s;
3152 }
3154 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3155 {
3156 int s, i;
3158 s = 0;
3159 for(i=0;i<h;i++) {
3160 s += abs(pix1[0] - pix2[0]);
3161 s += abs(pix1[1] - pix2[1]);
3162 s += abs(pix1[2] - pix2[2]);
3163 s += abs(pix1[3] - pix2[3]);
3164 s += abs(pix1[4] - pix2[4]);
3165 s += abs(pix1[5] - pix2[5]);
3166 s += abs(pix1[6] - pix2[6]);
3167 s += abs(pix1[7] - pix2[7]);
3168 pix1 += line_size;
3169 pix2 += line_size;
3170 }
3171 return s;
3172 }
3174 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3175 {
3176 int s, i;
3178 s = 0;
3179 for(i=0;i<h;i++) {
3180 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3181 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3182 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3183 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3184 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3185 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3186 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3187 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3188 pix1 += line_size;
3189 pix2 += line_size;
3191 return s;
3194 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3196 int s, i;
3197 uint8_t *pix3 = pix2 + line_size;
3199 s = 0;
3200 for(i=0;i<h;i++) {
3201 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3202 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3203 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3204 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3205 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3206 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3207 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3208 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3209 pix1 += line_size;
3210 pix2 += line_size;
3211 pix3 += line_size;
3213 return s;
3216 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3218 int s, i;
3219 uint8_t *pix3 = pix2 + line_size;
3221 s = 0;
3222 for(i=0;i<h;i++) {
3223 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3224 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3225 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3226 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3227 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3228 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3229 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3230 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3231 pix1 += line_size;
3232 pix2 += line_size;
3233 pix3 += line_size;
3235 return s;
3238 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3239 MpegEncContext *c = v;
3240 int score1=0;
3241 int score2=0;
3242 int x,y;
3244 for(y=0; y<h; y++){
3245 for(x=0; x<16; x++){
3246 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3248 if(y+1<h){
3249 for(x=0; x<15; x++){
3250 score2+= FFABS( s1[x ] - s1[x +stride]
3251 - s1[x+1] + s1[x+1+stride])
3252 -FFABS( s2[x ] - s2[x +stride]
3253 - s2[x+1] + s2[x+1+stride]);
3256 s1+= stride;
3257 s2+= stride;
3260 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3261 else return score1 + FFABS(score2)*8;
3264 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3265 MpegEncContext *c = v;
3266 int score1=0;
3267 int score2=0;
3268 int x,y;
3270 for(y=0; y<h; y++){
3271 for(x=0; x<8; x++){
3272 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3274 if(y+1<h){
3275 for(x=0; x<7; x++){
3276 score2+= FFABS( s1[x ] - s1[x +stride]
3277 - s1[x+1] + s1[x+1+stride])
3278 -FFABS( s2[x ] - s2[x +stride]
3279 - s2[x+1] + s2[x+1+stride]);
3282 s1+= stride;
3283 s2+= stride;
3286 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3287 else return score1 + FFABS(score2)*8;
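/* nsse16_c/nsse8_c implement a "noise preserving" comparison: plain SSE
 * (score1) plus the absolute difference of the two blocks' local 2x2
 * gradients (score2), weighted by avctx->nsse_weight, or by 8 when no
 * context is available. A width-generic sketch of the same idea, assuming
 * the exact semantics of the functions above:
 */
#if 0
static int nsse_sketch(uint8_t *s1, uint8_t *s2, int stride, int w, int h, int weight)
{
    int score1 = 0, score2 = 0, x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < w; x++)
            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
        if (y + 1 < h)
            for (x = 0; x + 1 < w; x++)
                score2 += FFABS(s1[x] - s1[x+stride] - s1[x+1] + s1[x+1+stride])
                        - FFABS(s2[x] - s2[x+stride] - s2[x+1] + s2[x+1+stride]);
        s1 += stride;
        s2 += stride;
    }
    return score1 + FFABS(score2) * weight;
}
#endif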
3290 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3291 int i;
3292 unsigned int sum=0;
3294 for(i=0; i<8*8; i++){
3295 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3296 int w= weight[i];
3297 b>>= RECON_SHIFT;
3298 assert(-512<b && b<512);
3300 sum += (w*b)*(w*b)>>4;
3302 return sum>>2;
3305 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3306 int i;
3308 for(i=0; i<8*8; i++){
3309 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
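/* try_8x8basis_c/add_8x8basis_c work in fixed point: basis[] carries
 * BASIS_SHIFT fraction bits and rem[] RECON_SHIFT, so the shared
 * (basis[i]*scale + rounding) >> (BASIS_SHIFT - RECON_SHIFT) term scales
 * the basis vector and rounds it into rem's precision. try_ evaluates the
 * weighted squared error without committing, which permits a hypothetical
 * test-then-commit use like this sketch (rem/weight/basis/scale as above):
 */
#if 0
if (try_8x8basis_c(rem, weight, basis, scale) <
    try_8x8basis_c(rem, weight, basis, 0))
    add_8x8basis_c(rem, basis, scale);   /* keep the change only if it helps */
#endif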
3313 /**
3314 * permutes an 8x8 block.
3315 * @param block the block which will be permuted according to the given permutation vector
3316 * @param permutation the permutation vector
3317 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3318 * @param scantable the scantable used; it only speeds the permutation up, the block is not
3319 * (inverse) permuted to scantable order!
3320 */
3321 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3323 int i;
3324 DCTELEM temp[64];
3326 if(last<=0) return;
3327 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3329 for(i=0; i<=last; i++){
3330 const int j= scantable[i];
3331 temp[j]= block[j];
3332 block[j]=0;
3335 for(i=0; i<=last; i++){
3336 const int j= scantable[i];
3337 const int perm_j= permutation[j];
3338 block[perm_j]= temp[j];
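/* The two passes above let the permutation run in place: the first pass
 * saves every touched coefficient into temp[] and clears its slot, so the
 * scatter through permutation[] neither reads an already-overwritten value
 * nor leaves a stale coefficient behind. A single-pass sketch of the
 * hazard (hypothetical helper, not part of this file):
 */
#if 0
static void block_permute_naive(DCTELEM *block, uint8_t *permutation)
{
    int i;
    for (i = 0; i < 64; i++)
        block[permutation[i]] = block[i];  /* WRONG: may read a slot that
                                              was already overwritten */
}
#endif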
3342 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3343 return 0;
3346 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3347 int i;
3349 memset(cmp, 0, sizeof(void*)*5);
3351 for(i=0; i<5; i++){
3352 switch(type&0xFF){
3353 case FF_CMP_SAD:
3354 cmp[i]= c->sad[i];
3355 break;
3356 case FF_CMP_SATD:
3357 cmp[i]= c->hadamard8_diff[i];
3358 break;
3359 case FF_CMP_SSE:
3360 cmp[i]= c->sse[i];
3361 break;
3362 case FF_CMP_DCT:
3363 cmp[i]= c->dct_sad[i];
3364 break;
3365 case FF_CMP_DCT264:
3366 cmp[i]= c->dct264_sad[i];
3367 break;
3368 case FF_CMP_DCTMAX:
3369 cmp[i]= c->dct_max[i];
3370 break;
3371 case FF_CMP_PSNR:
3372 cmp[i]= c->quant_psnr[i];
3373 break;
3374 case FF_CMP_BIT:
3375 cmp[i]= c->bit[i];
3376 break;
3377 case FF_CMP_RD:
3378 cmp[i]= c->rd[i];
3379 break;
3380 case FF_CMP_VSAD:
3381 cmp[i]= c->vsad[i];
3382 break;
3383 case FF_CMP_VSSE:
3384 cmp[i]= c->vsse[i];
3385 break;
3386 case FF_CMP_ZERO:
3387 cmp[i]= zero_cmp;
3388 break;
3389 case FF_CMP_NSSE:
3390 cmp[i]= c->nsse[i];
3391 break;
3392 #ifdef CONFIG_SNOW_ENCODER
3393 case FF_CMP_W53:
3394 cmp[i]= c->w53[i];
3395 break;
3396 case FF_CMP_W97:
3397 cmp[i]= c->w97[i];
3398 break;
3399 #endif
3400 default:
3401 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
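/* type&0xFF above keeps only the comparison id, so flag bits in the high
 * byte (such as FF_CMP_CHROMA) pass through unharmed and are handled by
 * the callers. A minimal usage sketch, assuming s is an initialized
 * MpegEncContext:
 */
#if 0
me_cmp_func sad_cmp[5];
ff_set_cmp(&s->dsp, sad_cmp, FF_CMP_SAD);  /* sad_cmp[0]: 16x16, [1]: 8x8, ... */
#endif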
3406 /**
3407 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3408 */
3409 static void clear_blocks_c(DCTELEM *blocks)
3411 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3414 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3415 long i;
3416 for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ /* (long) cast: avoid unsigned wrap when w < sizeof(long) */
3417 long a = *(long*)(src+i);
3418 long b = *(long*)(dst+i);
3419 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3421 for(; i<w; i++)
3422 dst[i+0] += src[i+0];
3425 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3426 long i;
3427 for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ /* (long) cast: avoid unsigned wrap when w < sizeof(long) */
3428 long a = *(long*)(src1+i);
3429 long b = *(long*)(src2+i);
3430 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3432 for(; i<w; i++)
3433 dst[i] = src1[i]+src2[i];
3436 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3437 long i;
3438 #ifndef HAVE_FAST_UNALIGNED
3439 if((long)src2 & (sizeof(long)-1)){
3440 for(i=0; i+7<w; i+=8){
3441 dst[i+0] = src1[i+0]-src2[i+0];
3442 dst[i+1] = src1[i+1]-src2[i+1];
3443 dst[i+2] = src1[i+2]-src2[i+2];
3444 dst[i+3] = src1[i+3]-src2[i+3];
3445 dst[i+4] = src1[i+4]-src2[i+4];
3446 dst[i+5] = src1[i+5]-src2[i+5];
3447 dst[i+6] = src1[i+6]-src2[i+6];
3448 dst[i+7] = src1[i+7]-src2[i+7];
3450 }else
3451 #endif
3452 for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){ /* (long) cast: avoid unsigned wrap when w < sizeof(long) */
3453 long a = *(long*)(src1+i);
3454 long b = *(long*)(src2+i);
3455 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3457 for(; i<w; i++)
3458 dst[i+0] = src1[i+0]-src2[i+0];
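/* add_bytes_c/add_bytes_l2_c and diff_bytes_c above use a SWAR trick to
 * process sizeof(long) pixels per iteration: pb_7f/pb_80 replicate
 * 0x7f/0x80 into every byte, (a&pb_7f)+(b&pb_7f) adds the low 7 bits of
 * each byte with no carry crossing byte lanes, and ^((a^b)&pb_80) repairs
 * each byte's top bit (the subtraction form isolates the borrow the same
 * way). A one-byte self-check of the addition, as a sketch:
 */
#if 0
static int swar_byte_add_check(void)
{
    unsigned a = 0x9A, b = 0x77;
    unsigned swar = ((a & 0x7f) + (b & 0x7f)) ^ ((a ^ b) & 0x80);
    return swar == ((a + b) & 0xFF);    /* both are 0x11; returns 1 */
}
#endif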
3461 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3462 int i;
3463 uint8_t l, lt;
3465 l= *left;
3466 lt= *left_top;
3468 for(i=0; i<w; i++){
3469 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3470 lt= src1[i];
3471 l= src2[i];
3472 dst[i]= l - pred;
3475 *left= l;
3476 *left_top= lt;
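/* sub_hfyu_median_prediction_c writes the residual of the HuffYUV median
 * predictor: pred = median(left, top, left + top - topleft), which
 * mid_pred() computes without a full sort. A branchy reference sketch of
 * the same median-of-three (hypothetical name):
 */
#if 0
static inline int median3_sketch(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; }   /* now a <= b */
    if (b > c)  b = c;                        /* b = min(max of pair, c) */
    return FFMAX(a, b);                       /* median of the three */
}
#endif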
3479 #define BUTTERFLY2(o1,o2,i1,i2) \
3480 o1= (i1)+(i2);\
3481 o2= (i1)-(i2);
3483 #define BUTTERFLY1(x,y) \
3484 {\
3485 int a,b;\
3486 a= x;\
3487 b= y;\
3488 x= a+b;\
3489 y= a-b;\
3490 }
3492 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3494 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3495 int i;
3496 int temp[64];
3497 int sum=0;
3499 assert(h==8);
3501 for(i=0; i<8; i++){
3502 //FIXME try pointer walks
3503 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3504 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3505 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3506 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3508 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3509 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3510 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3511 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3513 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3514 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3515 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3516 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3519 for(i=0; i<8; i++){
3520 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3521 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3522 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3523 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3525 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3526 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3527 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3528 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3530 sum +=
3531 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3532 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3533 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3534 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3543 return sum;
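/* hadamard8_diff8x8_c is the classic SATD: the residual src-dst is run
 * through an 8x8 Hadamard transform (three butterfly stages per row, then
 * per column, with BUTTERFLYA folding the last column stage into the
 * absolute-value sum). A 1-D length-8 Walsh-Hadamard sketch of the same
 * butterfly structure (illustrative helper, not used by this file):
 */
#if 0
static void wht8_sketch(int v[8])
{
    int stage, i, j;
    for (stage = 1; stage < 8; stage <<= 1)        /* stages 1, 2, 4 */
        for (i = 0; i < 8; i += stage << 1)
            for (j = i; j < i + stage; j++) {
                int a = v[j], b = v[j + stage];
                v[j]         = a + b;
                v[j + stage] = a - b;
            }
}
#endif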
3546 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3547 int i;
3548 int temp[64];
3549 int sum=0;
3551 assert(h==8);
3553 for(i=0; i<8; i++){
3554 //FIXME try pointer walks
3555 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3556 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3557 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3558 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3560 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3561 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3562 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3563 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3565 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3566 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3567 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3568 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3571 for(i=0; i<8; i++){
3572 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3573 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3574 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3575 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3577 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3578 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3579 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3580 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3582 sum +=
3583 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3584 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3585 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3586 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3589 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3591 return sum;
3594 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3595 MpegEncContext * const s= (MpegEncContext *)c;
3596 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3597 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3599 assert(h==8);
3601 s->dsp.diff_pixels(temp, src1, src2, stride);
3602 s->dsp.fdct(temp);
3603 return s->dsp.sum_abs_dctelem(temp);
3606 #ifdef CONFIG_GPL
3607 #define DCT8_1D {\
3608 const int s07 = SRC(0) + SRC(7);\
3609 const int s16 = SRC(1) + SRC(6);\
3610 const int s25 = SRC(2) + SRC(5);\
3611 const int s34 = SRC(3) + SRC(4);\
3612 const int a0 = s07 + s34;\
3613 const int a1 = s16 + s25;\
3614 const int a2 = s07 - s34;\
3615 const int a3 = s16 - s25;\
3616 const int d07 = SRC(0) - SRC(7);\
3617 const int d16 = SRC(1) - SRC(6);\
3618 const int d25 = SRC(2) - SRC(5);\
3619 const int d34 = SRC(3) - SRC(4);\
3620 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3621 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3622 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3623 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3624 DST(0, a0 + a1 ) ;\
3625 DST(1, a4 + (a7>>2)) ;\
3626 DST(2, a2 + (a3>>1)) ;\
3627 DST(3, a5 + (a6>>2)) ;\
3628 DST(4, a0 - a1 ) ;\
3629 DST(5, a6 - (a5>>2)) ;\
3630 DST(6, (a2>>1) - a3 ) ;\
3631 DST(7, (a4>>2) - a7 ) ;\
3632 }
3634 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3635 MpegEncContext * const s= (MpegEncContext *)c;
3636 DCTELEM dct[8][8];
3637 int i;
3638 int sum=0;
3640 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3642 #define SRC(x) dct[i][x]
3643 #define DST(x,v) dct[i][x]= v
3644 for( i = 0; i < 8; i++ )
3645 DCT8_1D
3646 #undef SRC
3647 #undef DST
3649 #define SRC(x) dct[x][i]
3650 #define DST(x,v) sum += FFABS(v)
3651 for( i = 0; i < 8; i++ )
3652 DCT8_1D
3653 #undef SRC
3654 #undef DST
3655 return sum;
3657 #endif
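/* DCT8_1D above is the 8-point kernel of the H.264/AVC 8x8 integer
 * transform approximation (hence "dct264_sad"). Redefining SRC/DST around
 * the two loops retargets one macro: the first pass transforms rows in
 * place, the second reads columns and accumulates |coefficient| directly
 * instead of storing it. A self-contained 2x2 miniature of that
 * row-then-column pattern:
 */
#if 0
static int separable_cost_sketch(int blk[2][2])
{
    int i, t, sum = 0;
    for (i = 0; i < 2; i++) {                  /* row pass, in place */
        t = blk[i][0];
        blk[i][0] = t + blk[i][1];
        blk[i][1] = t - blk[i][1];
    }
    for (i = 0; i < 2; i++) {                  /* column pass feeds the cost */
        sum += FFABS(blk[0][i] + blk[1][i]);
        sum += FFABS(blk[0][i] - blk[1][i]);
    }
    return sum;
}
#endif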
3659 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3660 MpegEncContext * const s= (MpegEncContext *)c;
3661 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3662 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3663 int sum=0, i;
3665 assert(h==8);
3667 s->dsp.diff_pixels(temp, src1, src2, stride);
3668 s->dsp.fdct(temp);
3670 for(i=0; i<64; i++)
3671 sum= FFMAX(sum, FFABS(temp[i]));
3673 return sum;
3676 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3677 MpegEncContext * const s= (MpegEncContext *)c;
3678 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3679 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3680 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3681 int sum=0, i;
3683 assert(h==8);
3684 s->mb_intra=0;
3686 s->dsp.diff_pixels(temp, src1, src2, stride);
3688 memcpy(bak, temp, 64*sizeof(DCTELEM));
3690 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3691 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3692 ff_simple_idct(temp); //FIXME
3694 for(i=0; i<64; i++)
3695 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3697 return sum;
3700 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3701 MpegEncContext * const s= (MpegEncContext *)c;
3702 const uint8_t *scantable= s->intra_scantable.permutated;
3703 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3704 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3705 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3706 uint8_t * const bak= (uint8_t*)aligned_bak;
3707 int i, last, run, bits, level, distortion, start_i;
3708 const int esc_length= s->ac_esc_length;
3709 uint8_t * length;
3710 uint8_t * last_length;
3712 assert(h==8);
3714 for(i=0; i<8; i++){
3715 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3716 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3719 s->dsp.diff_pixels(temp, src1, src2, stride);
3721 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3723 bits=0;
3725 if (s->mb_intra) {
3726 start_i = 1;
3727 length = s->intra_ac_vlc_length;
3728 last_length= s->intra_ac_vlc_last_length;
3729 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3730 } else {
3731 start_i = 0;
3732 length = s->inter_ac_vlc_length;
3733 last_length= s->inter_ac_vlc_last_length;
3736 if(last>=start_i){
3737 run=0;
3738 for(i=start_i; i<last; i++){
3739 int j= scantable[i];
3740 level= temp[j];
3742 if(level){
3743 level+=64;
3744 if((level&(~127)) == 0){
3745 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3746 }else
3747 bits+= esc_length;
3748 run=0;
3749 }else
3750 run++;
3752 i= scantable[last];
3754 level= temp[i] + 64;
3756 assert(level - 64);
3758 if((level&(~127)) == 0){
3759 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3760 }else
3761 bits+= esc_length;
3765 if(last>=0){
3766 if(s->mb_intra)
3767 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3768 else
3769 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3772 s->dsp.idct_add(bak, stride, temp);
3774 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3776 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
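/* The return value above is a rate-distortion cost D + lambda*R with the
 * lambda folded into integer arithmetic: (bits*qscale*qscale*109 + 64)>>7
 * weights the bit count by roughly 0.85*qscale^2 (109/128 ~= 0.85), with
 * +64 rounding the shift. The same weighting in floating point, as a
 * sketch (distortion/qscale/bits as in the function above):
 */
#if 0
double rd_cost = distortion + 0.85 * qscale * qscale * bits;
#endif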
3779 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3780 MpegEncContext * const s= (MpegEncContext *)c;
3781 const uint8_t *scantable= s->intra_scantable.permutated;
3782 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3783 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3784 int i, last, run, bits, level, start_i;
3785 const int esc_length= s->ac_esc_length;
3786 uint8_t * length;
3787 uint8_t * last_length;
3789 assert(h==8);
3791 s->dsp.diff_pixels(temp, src1, src2, stride);
3793 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3795 bits=0;
3797 if (s->mb_intra) {
3798 start_i = 1;
3799 length = s->intra_ac_vlc_length;
3800 last_length= s->intra_ac_vlc_last_length;
3801 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3802 } else {
3803 start_i = 0;
3804 length = s->inter_ac_vlc_length;
3805 last_length= s->inter_ac_vlc_last_length;
3808 if(last>=start_i){
3809 run=0;
3810 for(i=start_i; i<last; i++){
3811 int j= scantable[i];
3812 level= temp[j];
3814 if(level){
3815 level+=64;
3816 if((level&(~127)) == 0){
3817 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3818 }else
3819 bits+= esc_length;
3820 run=0;
3821 }else
3822 run++;
3824 i= scantable[last];
3826 level= temp[i] + 64;
3828 assert(level - 64);
3830 if((level&(~127)) == 0){
3831 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3832 }else
3833 bits+= esc_length;
3836 return bits;
3839 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3840 int score=0;
3841 int x,y;
3843 for(y=1; y<h; y++){
3844 for(x=0; x<16; x+=4){
3845 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3846 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
3848 s+= stride;
3851 return score;
3854 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3855 int score=0;
3856 int x,y;
3858 for(y=1; y<h; y++){
3859 for(x=0; x<16; x++){
3860 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3862 s1+= stride;
3863 s2+= stride;
3866 return score;
3869 #define SQ(a) ((a)*(a))
3870 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3871 int score=0;
3872 int x,y;
3874 for(y=1; y<h; y++){
3875 for(x=0; x<16; x+=4){
3876 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3877 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3879 s+= stride;
3882 return score;
3885 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3886 int score=0;
3887 int x,y;
3889 for(y=1; y<h; y++){
3890 for(x=0; x<16; x++){
3891 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3893 s1+= stride;
3894 s2+= stride;
3897 return score;
3900 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3901 int size){
3902 int score=0;
3903 int i;
3904 for(i=0; i<size; i++)
3905 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3906 return score;
3909 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3910 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3911 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3912 #ifdef CONFIG_GPL
3913 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3914 #endif
3915 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3916 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3917 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3918 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
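/* WRAPPER8_16_SQ (defined earlier in this file) builds each 16-wide
 * comparison above out of its 8x8 kernel, presumably by summing the kernel
 * over the 8x8 quadrants. A sketch of that expansion for one pair, under
 * that assumption:
 */
#if 0
static int dct_sad16_sketch(void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int score = dct_sad8x8_c(s, dst,     src,     stride, 8)
              + dct_sad8x8_c(s, dst + 8, src + 8, stride, 8);
    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += dct_sad8x8_c(s, dst,     src,     stride, 8)
               + dct_sad8x8_c(s, dst + 8, src + 8, stride, 8);
    }
    return score;
}
#endif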
3920 static void vector_fmul_c(float *dst, const float *src, int len){
3921 int i;
3922 for(i=0; i<len; i++)
3923 dst[i] *= src[i];
3926 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3927 int i;
3928 src1 += len-1;
3929 for(i=0; i<len; i++)
3930 dst[i] = src0[i] * src1[-i];
3933 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3934 int i;
3935 for(i=0; i<len; i++)
3936 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
3939 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3940 int i,j;
3941 dst += len;
3942 win += len;
3943 src0+= len;
3944 for(i=-len, j=len-1; i<0; i++, j--) {
3945 float s0 = src0[i];
3946 float s1 = src1[j];
3947 float wi = win[i];
3948 float wj = win[j];
3949 dst[i] = s0*wj - s1*wi + add_bias;
3950 dst[j] = s0*wi + s1*wj + add_bias;
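/* ff_vector_fmul_window_c computes the windowed overlap between two blocks
 * as used by MDCT-based audio decoders: i walks the output from the left,
 * j mirrors it from the right, and each (s0, s1) pair is rotated by the
 * window values wi/wj. One mirrored pair in isolation, as a sketch:
 */
#if 0
static void fmul_window_pair_sketch(float *lo, float *hi, float s0, float s1,
                                    float wi, float wj, float add_bias)
{
    *lo = s0 * wj - s1 * wi + add_bias;   /* dst[i] in the loop above */
    *hi = s0 * wi + s1 * wj + add_bias;   /* dst[j] in the loop above */
}
#endif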
3954 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3955 int i;
3956 for(i=0; i<len; i++)
3957 dst[i] = src[i] * mul;
3960 static av_always_inline int float_to_int16_one(const float *src){
3961 int_fast32_t tmp = *(const int32_t*)src;
3962 if(tmp & 0xf0000){
3963 tmp = (0x43c0ffff - tmp)>>31;
3964 // is this faster on some gcc/cpu combinations?
3965 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3966 // else tmp = 0;
3968 return tmp - 0x8000;
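/* float_to_int16_one reinterprets the float's IEEE-754 bits directly: the
 * conversion assumes callers have pre-scaled/biased the samples into a
 * narrow exponent range in which the low bits of the representation are
 * already the desired fixed-point value; the (0x43c0ffff - tmp)>>31 step
 * clamps anything outside that range to 0 or 0xFFFF before the -0x8000
 * recentering. A portable (slower) equivalent, assuming instead plain
 * samples in [-32768.0, 32767.0]:
 */
#if 0
#include <math.h>
static inline int16_t float_to_int16_portable(float x)
{
    long v = lrintf(x);
    if (v >  32767) v =  32767;
    if (v < -32768) v = -32768;
    return (int16_t)v;
}
#endif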
3971 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3972 int i;
3973 for(i=0; i<len; i++)
3974 dst[i] = float_to_int16_one(src+i);
3977 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3978 int i,j,c;
3979 if(channels==2){
3980 for(i=0; i<len; i++){
3981 dst[2*i] = float_to_int16_one(src[0]+i);
3982 dst[2*i+1] = float_to_int16_one(src[1]+i);
3984 }else{
3985 for(c=0; c<channels; c++)
3986 for(i=0, j=c; i<len; i++, j+=channels)
3987 dst[j] = float_to_int16_one(src[c]+i);
3991 static void add_int16_c(int16_t * v1, int16_t * v2, int order)
3993 while (order--)
3994 *v1++ += *v2++;
3997 static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
3999 while (order--)
4000 *v1++ -= *v2++;
4003 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4005 int res = 0;
4007 while (order--)
4008 res += (*v1++ * *v2++) >> shift;
4010 return res;
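/* Note that scalarproduct_int16_c shifts every product before summing, so
 * it is not equivalent to shifting the final sum once: each term drops its
 * low bits independently. A full-precision variant with a wider
 * accumulator, as a sketch:
 */
#if 0
static int64_t scalarproduct_int16_wide_sketch(const int16_t *v1, const int16_t *v2,
                                               int order, int shift)
{
    int64_t res = 0;
    while (order--)
        res += *v1++ * *v2++;
    return res >> shift;
}
#endif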
4013 #define W0 2048
4014 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4015 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4016 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4017 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4018 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4019 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4020 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
4022 static void wmv2_idct_row(short * b)
4024 int s1,s2;
4025 int a0,a1,a2,a3,a4,a5,a6,a7;
4026 /*step 1*/
4027 a1 = W1*b[1]+W7*b[7];
4028 a7 = W7*b[1]-W1*b[7];
4029 a5 = W5*b[5]+W3*b[3];
4030 a3 = W3*b[5]-W5*b[3];
4031 a2 = W2*b[2]+W6*b[6];
4032 a6 = W6*b[2]-W2*b[6];
4033 a0 = W0*b[0]+W0*b[4];
4034 a4 = W0*b[0]-W0*b[4];
4035 /*step 2*/
4036 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4037 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4038 /*step 3*/
4039 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4040 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4041 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4042 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4043 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4044 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4045 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4046 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4048 static void wmv2_idct_col(short * b)
4050 int s1,s2;
4051 int a0,a1,a2,a3,a4,a5,a6,a7;
4052 /*step 1, with extended precision*/
4053 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4054 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4055 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4056 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4057 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4058 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4059 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4060 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4061 /*step 2*/
4062 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4063 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4064 /*step 3*/
4065 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4066 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4067 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4068 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4070 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4071 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4072 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4073 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4075 void ff_wmv2_idct_c(short * block){
4076 int i;
4078 for(i=0;i<64;i+=8){
4079 wmv2_idct_row(block+i);
4081 for(i=0;i<8;i++){
4082 wmv2_idct_col(block+i);
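/* wmv2_idct_row/wmv2_idct_col form a separable fixed-point IDCT: W1..W7
 * are cos(k*pi/16) scaled by 2048*sqrt(2), and every (x + (1<<(b-1))) >> b
 * removes b fraction bits with round-to-nearest. The column pass keeps
 * three extra bits (the early >>3 "extended precision") before the final
 * >>14. The rounding idiom in isolation, as a sketch:
 */
#if 0
static inline int round_shift_sketch(int x, int b)
{
    return (x + (1 << (b - 1))) >> b;   /* round to nearest, ties rounded up */
}
#endif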
4085 /* XXX: these functions should be removed as soon as all IDCTs are
4086    converted */
4087 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4089 ff_wmv2_idct_c(block);
4090 put_pixels_clamped_c(block, dest, line_size);
4092 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4094 ff_wmv2_idct_c(block);
4095 add_pixels_clamped_c(block, dest, line_size);
4097 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4099 j_rev_dct (block);
4100 put_pixels_clamped_c(block, dest, line_size);
4102 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4104 j_rev_dct (block);
4105 add_pixels_clamped_c(block, dest, line_size);
4108 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4110 j_rev_dct4 (block);
4111 put_pixels_clamped4_c(block, dest, line_size);
4113 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4115 j_rev_dct4 (block);
4116 add_pixels_clamped4_c(block, dest, line_size);
4119 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4121 j_rev_dct2 (block);
4122 put_pixels_clamped2_c(block, dest, line_size);
4124 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4126 j_rev_dct2 (block);
4127 add_pixels_clamped2_c(block, dest, line_size);
4130 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4132 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4134 dest[0] = cm[(block[0] + 4)>>3];
4136 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4138 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4140 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4143 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4145 /* init static data */
4146 void dsputil_static_init(void)
4148 int i;
4150 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4151 for(i=0;i<MAX_NEG_CROP;i++) {
4152 ff_cropTbl[i] = 0;
4153 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4156 for(i=0;i<512;i++) {
4157 ff_squareTbl[i] = (i - 256) * (i - 256);
4160 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
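/* ff_cropTbl is laid out as MAX_NEG_CROP zeros, the identity 0..255, then
 * MAX_NEG_CROP copies of 255, so indexing it through a recentered pointer
 * clamps to [0,255] without branches. Usage sketch, with v a hypothetical
 * intermediate that may under- or overshoot:
 */
#if 0
const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
uint8_t pixel = cm[v];   /* behaves like av_clip_uint8(v) within table range */
#endif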
4163 int ff_check_alignment(void){
4164 static int did_fail=0;
4165 DECLARE_ALIGNED_16(int, aligned);
4167 if((long)&aligned & 15){
4168 if(!did_fail){
4169 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4170 av_log(NULL, AV_LOG_ERROR,
4171 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4172 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4173 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4174 "Do not report crashes to FFmpeg developers.\n");
4175 #endif
4176 did_fail=1;
4178 return -1;
4180 return 0;
4183 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4185 int i;
4187 ff_check_alignment();
4189 #ifdef CONFIG_ENCODERS
4190 if(avctx->dct_algo==FF_DCT_FASTINT) {
4191 c->fdct = fdct_ifast;
4192 c->fdct248 = fdct_ifast248;
4194 else if(avctx->dct_algo==FF_DCT_FAAN) {
4195 c->fdct = ff_faandct;
4196 c->fdct248 = ff_faandct248;
4198 else {
4199 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4200 c->fdct248 = ff_fdct248_islow;
4202 #endif //CONFIG_ENCODERS
4204 if(avctx->lowres==1){
4205 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4206 c->idct_put= ff_jref_idct4_put;
4207 c->idct_add= ff_jref_idct4_add;
4208 }else{
4209 c->idct_put= ff_h264_lowres_idct_put_c;
4210 c->idct_add= ff_h264_lowres_idct_add_c;
4212 c->idct = j_rev_dct4;
4213 c->idct_permutation_type= FF_NO_IDCT_PERM;
4214 }else if(avctx->lowres==2){
4215 c->idct_put= ff_jref_idct2_put;
4216 c->idct_add= ff_jref_idct2_add;
4217 c->idct = j_rev_dct2;
4218 c->idct_permutation_type= FF_NO_IDCT_PERM;
4219 }else if(avctx->lowres==3){
4220 c->idct_put= ff_jref_idct1_put;
4221 c->idct_add= ff_jref_idct1_add;
4222 c->idct = j_rev_dct1;
4223 c->idct_permutation_type= FF_NO_IDCT_PERM;
4224 }else{
4225 if(avctx->idct_algo==FF_IDCT_INT){
4226 c->idct_put= ff_jref_idct_put;
4227 c->idct_add= ff_jref_idct_add;
4228 c->idct = j_rev_dct;
4229 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4230 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4231 avctx->idct_algo==FF_IDCT_VP3){
4232 c->idct_put= ff_vp3_idct_put_c;
4233 c->idct_add= ff_vp3_idct_add_c;
4234 c->idct = ff_vp3_idct_c;
4235 c->idct_permutation_type= FF_NO_IDCT_PERM;
4236 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4237 c->idct_put= ff_wmv2_idct_put_c;
4238 c->idct_add= ff_wmv2_idct_add_c;
4239 c->idct = ff_wmv2_idct_c;
4240 c->idct_permutation_type= FF_NO_IDCT_PERM;
4241 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4242 c->idct_put= ff_faanidct_put;
4243 c->idct_add= ff_faanidct_add;
4244 c->idct = ff_faanidct;
4245 c->idct_permutation_type= FF_NO_IDCT_PERM;
4246 }else if(ENABLE_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4247 c->idct_put= ff_ea_idct_put_c;
4248 c->idct_permutation_type= FF_NO_IDCT_PERM;
4249 }else{ //accurate/default
4250 c->idct_put= ff_simple_idct_put;
4251 c->idct_add= ff_simple_idct_add;
4252 c->idct = ff_simple_idct;
4253 c->idct_permutation_type= FF_NO_IDCT_PERM;
4257 if (ENABLE_H264_DECODER) {
4258 c->h264_idct_add= ff_h264_idct_add_c;
4259 c->h264_idct8_add= ff_h264_idct8_add_c;
4260 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4261 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4264 c->get_pixels = get_pixels_c;
4265 c->diff_pixels = diff_pixels_c;
4266 c->put_pixels_clamped = put_pixels_clamped_c;
4267 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4268 c->add_pixels_clamped = add_pixels_clamped_c;
4269 c->add_pixels8 = add_pixels8_c;
4270 c->add_pixels4 = add_pixels4_c;
4271 c->sum_abs_dctelem = sum_abs_dctelem_c;
4272 c->gmc1 = gmc1_c;
4273 c->gmc = ff_gmc_c;
4274 c->clear_blocks = clear_blocks_c;
4275 c->pix_sum = pix_sum_c;
4276 c->pix_norm1 = pix_norm1_c;
4278 /* pix_abs[0] is for 16x16 blocks, pix_abs[1] for 8x8 blocks */
4279 c->pix_abs[0][0] = pix_abs16_c;
4280 c->pix_abs[0][1] = pix_abs16_x2_c;
4281 c->pix_abs[0][2] = pix_abs16_y2_c;
4282 c->pix_abs[0][3] = pix_abs16_xy2_c;
4283 c->pix_abs[1][0] = pix_abs8_c;
4284 c->pix_abs[1][1] = pix_abs8_x2_c;
4285 c->pix_abs[1][2] = pix_abs8_y2_c;
4286 c->pix_abs[1][3] = pix_abs8_xy2_c;
4288 #define dspfunc(PFX, IDX, NUM) \
4289 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4290 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4291 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4292 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4294 dspfunc(put, 0, 16);
4295 dspfunc(put_no_rnd, 0, 16);
4296 dspfunc(put, 1, 8);
4297 dspfunc(put_no_rnd, 1, 8);
4298 dspfunc(put, 2, 4);
4299 dspfunc(put, 3, 2);
4301 dspfunc(avg, 0, 16);
4302 dspfunc(avg_no_rnd, 0, 16);
4303 dspfunc(avg, 1, 8);
4304 dspfunc(avg_no_rnd, 1, 8);
4305 dspfunc(avg, 2, 4);
4306 dspfunc(avg, 3, 2);
4307 #undef dspfunc
4309 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4310 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4312 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4313 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4314 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4315 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4316 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4317 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4318 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4319 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4320 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4322 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4323 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4324 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4325 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4326 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4327 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4328 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4329 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4330 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4332 #define dspfunc(PFX, IDX, NUM) \
4333 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4334 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4335 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4336 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4337 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4338 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4339 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4340 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4341 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4342 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4343 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4344 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4345 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4346 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4347 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4348 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4350 dspfunc(put_qpel, 0, 16);
4351 dspfunc(put_no_rnd_qpel, 0, 16);
4353 dspfunc(avg_qpel, 0, 16);
4354 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4356 dspfunc(put_qpel, 1, 8);
4357 dspfunc(put_no_rnd_qpel, 1, 8);
4359 dspfunc(avg_qpel, 1, 8);
4360 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4362 dspfunc(put_h264_qpel, 0, 16);
4363 dspfunc(put_h264_qpel, 1, 8);
4364 dspfunc(put_h264_qpel, 2, 4);
4365 dspfunc(put_h264_qpel, 3, 2);
4366 dspfunc(avg_h264_qpel, 0, 16);
4367 dspfunc(avg_h264_qpel, 1, 8);
4368 dspfunc(avg_h264_qpel, 2, 4);
4370 #undef dspfunc
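/* In the tables built above, the second index encodes the sub-pel phase of
 * the motion vector: the 4-entry half-pel tables (first dspfunc block
 * further up) use (dy<<1)|dx of the half-pel bits, while the 16-entry mcXY
 * qpel/h264 tables built here use 4*qy + qx of the quarter-pel bits. A
 * dispatch sketch, assuming motion vector components mx/my and a
 * filled-in DSPContext c:
 */
#if 0
int dxy_half = ((my & 1) << 1) | (mx & 1);      /* mv in half-pel units    */
int dxy_qpel = ((my & 3) << 2) | (mx & 3);      /* mv in quarter-pel units */
c->put_pixels_tab[0][dxy_half](dst, src, stride, 16);
c->put_qpel_pixels_tab[0][dxy_qpel](dst, src, stride);
#endif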
4371 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4372 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4373 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4374 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4375 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4376 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4377 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4379 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4380 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4381 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4382 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4383 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4384 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4385 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4386 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4387 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4388 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4389 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4390 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4391 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4392 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4393 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4394 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4395 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4396 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4397 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4398 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4400 c->draw_edges = draw_edges_c;
4402 #ifdef CONFIG_CAVS_DECODER
4403 ff_cavsdsp_init(c,avctx);
4404 #endif
4405 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4406 ff_vc1dsp_init(c,avctx);
4407 #endif
4408 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4409 ff_intrax8dsp_init(c,avctx);
4410 #endif
4411 #if defined(CONFIG_H264_ENCODER)
4412 ff_h264dspenc_init(c,avctx);
4413 #endif
4415 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4416 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4417 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4418 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4419 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4420 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4421 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4422 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4424 #define SET_CMP_FUNC(name) \
4425 c->name[0]= name ## 16_c;\
4426 c->name[1]= name ## 8x8_c;
4428 SET_CMP_FUNC(hadamard8_diff)
4429 c->hadamard8_diff[4]= hadamard8_intra16_c;
4430 SET_CMP_FUNC(dct_sad)
4431 SET_CMP_FUNC(dct_max)
4432 #ifdef CONFIG_GPL
4433 SET_CMP_FUNC(dct264_sad)
4434 #endif
4435 c->sad[0]= pix_abs16_c;
4436 c->sad[1]= pix_abs8_c;
4437 c->sse[0]= sse16_c;
4438 c->sse[1]= sse8_c;
4439 c->sse[2]= sse4_c;
4440 SET_CMP_FUNC(quant_psnr)
4441 SET_CMP_FUNC(rd)
4442 SET_CMP_FUNC(bit)
4443 c->vsad[0]= vsad16_c;
4444 c->vsad[4]= vsad_intra16_c;
4445 c->vsse[0]= vsse16_c;
4446 c->vsse[4]= vsse_intra16_c;
4447 c->nsse[0]= nsse16_c;
4448 c->nsse[1]= nsse8_c;
4449 #ifdef CONFIG_SNOW_ENCODER
4450 c->w53[0]= w53_16_c;
4451 c->w53[1]= w53_8_c;
4452 c->w97[0]= w97_16_c;
4453 c->w97[1]= w97_8_c;
4454 #endif
4456 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4458 c->add_bytes= add_bytes_c;
4459 c->add_bytes_l2= add_bytes_l2_c;
4460 c->diff_bytes= diff_bytes_c;
4461 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4462 c->bswap_buf= bswap_buf;
4463 #ifdef CONFIG_PNG_DECODER
4464 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4465 #endif
4467 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4468 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4469 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4470 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4471 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4472 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4473 c->h264_loop_filter_strength= NULL;
4475 if (ENABLE_ANY_H263) {
4476 c->h263_h_loop_filter= h263_h_loop_filter_c;
4477 c->h263_v_loop_filter= h263_v_loop_filter_c;
4480 if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
4481 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4482 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4485 c->h261_loop_filter= h261_loop_filter_c;
4487 c->try_8x8basis= try_8x8basis_c;
4488 c->add_8x8basis= add_8x8basis_c;
4490 #ifdef CONFIG_SNOW_DECODER
4491 c->vertical_compose97i = ff_snow_vertical_compose97i;
4492 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4493 c->inner_add_yblock = ff_snow_inner_add_yblock;
4494 #endif
4496 #ifdef CONFIG_VORBIS_DECODER
4497 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4498 #endif
4499 #ifdef CONFIG_AC3_DECODER
4500 c->ac3_downmix = ff_ac3_downmix_c;
4501 #endif
4502 #ifdef CONFIG_FLAC_ENCODER
4503 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4504 #endif
4505 c->vector_fmul = vector_fmul_c;
4506 c->vector_fmul_reverse = vector_fmul_reverse_c;
4507 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4508 c->vector_fmul_window = ff_vector_fmul_window_c;
4509 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4510 c->float_to_int16 = ff_float_to_int16_c;
4511 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4512 c->add_int16 = add_int16_c;
4513 c->sub_int16 = sub_int16_c;
4514 c->scalarproduct_int16 = scalarproduct_int16_c;
4516 c->shrink[0]= ff_img_copy_plane;
4517 c->shrink[1]= ff_shrink22;
4518 c->shrink[2]= ff_shrink44;
4519 c->shrink[3]= ff_shrink88;
4521 c->prefetch= just_return;
4523 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4524 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4526 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4527 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4528 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4529 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4530 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4531 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4532 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4533 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4534 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4536 for(i=0; i<64; i++){
4537 if(!c->put_2tap_qpel_pixels_tab[0][i])
4538 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4539 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4540 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4543 switch(c->idct_permutation_type){
4544 case FF_NO_IDCT_PERM:
4545 for(i=0; i<64; i++)
4546 c->idct_permutation[i]= i;
4547 break;
4548 case FF_LIBMPEG2_IDCT_PERM:
4549 for(i=0; i<64; i++)
4550 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4551 break;
4552 case FF_SIMPLE_IDCT_PERM:
4553 for(i=0; i<64; i++)
4554 c->idct_permutation[i]= simple_mmx_permutation[i];
4555 break;
4556 case FF_TRANSPOSE_IDCT_PERM:
4557 for(i=0; i<64; i++)
4558 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4559 break;
4560 case FF_PARTTRANS_IDCT_PERM:
4561 for(i=0; i<64; i++)
4562 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4563 break;
4564 case FF_SSE2_IDCT_PERM:
4565 for(i=0; i<64; i++)
4566 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4567 break;
4568 default:
4569 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");