/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file dsputil.c
 * DSP utils
 */
#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
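
/* Illustrative sketch (not part of the build): ~0UL/255 yields the
 * all-ones byte pattern 0x0101...01 at the native word size, so
 * multiplying it by a byte value replicates that byte into every lane.
 * On a 32-bit target pb_7f == 0x7f7f7f7f; on a 64-bit target
 * pb_7f == 0x7f7f7f7f7f7f7f7f. */
#if 0
static void pb_7f_example(void)
{
    const unsigned long ones = ~0UL / 255;  /* 0x0101...01 */
    assert(pb_7f == ones * 0x7f);           /* 0x7f7f...7f */
    assert(pb_80 == ones * 0x80);           /* 0x8080...80 */
}
#endif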
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
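
/* Illustrative sketch (not part of the build) of the reciprocal
 * multiplication this table enables: a/b becomes one multiply plus a
 * 32-bit shift, with no division instruction. */
#if 0
static void ff_inverse_example(void)
{
    unsigned a = 1000, b = 7;
    unsigned q = ((uint64_t)a * ff_inverse[b]) >> 32;
    assert(q == a / b);  /* holds for 0<=a<=65536 && 2<=b<=255 */
}
#endif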
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#ifdef ARCH_POWERPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
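
/* Hedged usage sketch (not part of the build): initializing a
 * ScanTable with the default zig-zag order and an identity
 * permutation; "st" and "identity" are illustrative names, not
 * definitions from this file. */
#if 0
static void init_scantable_example(void)
{
    ScanTable st;
    uint8_t identity[64];
    int i;
    for (i = 0; i < 64; i++)
        identity[i] = i;                /* no IDCT-specific reordering */
    ff_init_scantable(identity, &st, ff_zigzag_direct);
    /* st.permutated[i] now equals ff_zigzag_direct[i] */
}
#endif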
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
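
/* Note (illustrative, not part of the build): ff_squareTbl is indexed
 * with a +256 bias so that sq[d] == d*d for signed differences d in
 * [-255, 255]; pix_norm1_c above and the sse*_c functions below rely
 * on this. */
#if 0
static void square_tbl_example(void)
{
    uint32_t *sq = ff_squareTbl + 256;
    assert(sq[-3] == 9 && sq[3] == 9);
}
#endif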
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
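
/* Illustrative sketch (not part of the build): converting the 16x16
 * sum of squared errors into a PSNR figure; "ref", "rec" and "stride"
 * are illustrative names, and log10 would need <math.h>. */
#if 0
static double psnr16_example(uint8_t *ref, uint8_t *rec, int stride)
{
    int sse = sse16_c(NULL, ref, rec, stride, 16);
    double mse = sse / 256.0;                  /* 16*16 samples */
    return 10.0 * log10(255.0 * 255.0 / mse);
}
#endif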
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size,  8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
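
/* Hedged usage sketch (not part of the build): how motion compensation
 * typically falls back to edge emulation when a motion vector reaches
 * outside the reference picture. "ref", "edge_buf" and "linesize" are
 * illustrative names, not definitions from this file. */
#if 0
static void edge_mc_example(uint8_t *ref, uint8_t *edge_buf, int linesize,
                            int src_x, int src_y, int w, int h)
{
    const int block_w = 9, block_h = 9;   /* e.g. a halfpel 8x8 fetch */
    uint8_t *ptr = ref + src_y * linesize + src_x;
    if ((unsigned)src_x > w - block_w || (unsigned)src_y > h - block_h) {
        ff_emulated_edge_mc(edge_buf, ptr, linesize, block_w, block_h,
                            src_x, src_y, w, h);
        ptr = edge_buf;                   /* read from the padded copy */
    }
    /* ... interpolate from ptr ... */
}
#endif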
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
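
/* Worked example (not part of the build) of the packed rounded-average
 * identity behind op_avg/rnd_avg32 above:
 * (a|b) - (((a^b) & 0xFEFEFEFE) >> 1) computes (a+b+1)>>1 in each byte
 * lane without carries crossing lane boundaries. */
#if 0
static void rnd_avg_example(void)
{
    uint32_t a = 0x01FF7F00, b = 0x02017E00;
    uint32_t avg = (a | b) - (((a ^ b) & 0xFEFEFEFEUL) >> 1);
    /* per byte: (0x01+0x02+1)>>1 = 0x02, (0xFF+0x01+1)>>1 = 0x80,
     *           (0x7F+0x7E+1)>>1 = 0x7F, (0x00+0x00+1)>>1 = 0x00 */
    assert(avg == 0x02807F00);
}
#endif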
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
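
/* Worked example (not part of the build): the four bilinear weights in
 * gmc1_c always sum to 256, which is why ">>8" renormalizes the result.
 * For x16=4, y16=12: A=12*4=48, B=4*4=16, C=12*12=144, D=4*12=48, and
 * 48+16+144+48 == 256. */
#if 0
static void gmc1_weights_example(void)
{
    int x16 = 4, y16 = 12;
    assert((16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16 + x16*y16 == 256);
}
#endif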
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
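
/* Worked example (not part of the build): the thirdpel constants above
 * are rounded binary reciprocals, 683 ~= 2^11/3 (3*683 = 2049) and
 * 2731 ~= 2^15/12 (12*2731 = 32772), so e.g. (683*(2*a + b + 1)) >> 11
 * approximates (2*a + b)/3 without a division and stays within 8 bits:
 * for a=b=255, (683*766) >> 11 = 523178 >> 11 = 255. */
#if 0
static void tpel_constant_example(void)
{
    int a = 255, b = 255;
    assert(((683*(2*a + b + 1)) >> 11) == 255);
}
#endif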
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
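
/* Worked example (not part of the build): for any 0<=x,y<8 the chroma
 * weights above sum to 64 ((8-x)(8-y) + x(8-y) + (8-x)y + xy == 64),
 * so op_put's "(b + 32) >> 6" is a rounded division by 64. */
#if 0
static void chroma_weights_example(void)
{
    int x = 3, y = 5;
    assert((8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y == 64);
}
#endif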
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1815 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1816 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1817 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1818 dst++;\
1819 src++;\
1823 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1824 OPNAME ## pixels8_c(dst, src, stride, 8);\
1827 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1828 uint8_t half[64];\
1829 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1830 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1833 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1834 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1837 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1838 uint8_t half[64];\
1839 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1840 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1843 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1845 uint8_t half[64];\
1846 copy_block9(full, src, 16, stride, 9);\
1847 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1848 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1851 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t full[16*9];\
1853 copy_block9(full, src, 16, stride, 9);\
1854 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1857 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1858 uint8_t full[16*9];\
1859 uint8_t half[64];\
1860 copy_block9(full, src, 16, stride, 9);\
1861 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1862 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1864 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1865 uint8_t full[16*9];\
1866 uint8_t halfH[72];\
1867 uint8_t halfV[64];\
1868 uint8_t halfHV[64];\
1869 copy_block9(full, src, 16, stride, 9);\
1870 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1871 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1875 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1876 uint8_t full[16*9];\
1877 uint8_t halfH[72];\
1878 uint8_t halfHV[64];\
1879 copy_block9(full, src, 16, stride, 9);\
1880 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1882 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1885 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t full[16*9];\
1887 uint8_t halfH[72];\
1888 uint8_t halfV[64];\
1889 uint8_t halfHV[64];\
1890 copy_block9(full, src, 16, stride, 9);\
1891 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1892 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1894 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1896 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1897 uint8_t full[16*9];\
1898 uint8_t halfH[72];\
1899 uint8_t halfHV[64];\
1900 copy_block9(full, src, 16, stride, 9);\
1901 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1902 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1903 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1904 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1906 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1907 uint8_t full[16*9];\
1908 uint8_t halfH[72];\
1909 uint8_t halfV[64];\
1910 uint8_t halfHV[64];\
1911 copy_block9(full, src, 16, stride, 9);\
1912 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1913 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1914 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1915 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1917 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1918 uint8_t full[16*9];\
1919 uint8_t halfH[72];\
1920 uint8_t halfHV[64];\
1921 copy_block9(full, src, 16, stride, 9);\
1922 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1923 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1924 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1925 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1927 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[16*9];\
1929 uint8_t halfH[72];\
1930 uint8_t halfV[64];\
1931 uint8_t halfHV[64];\
1932 copy_block9(full, src, 16, stride, 9);\
1933 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1934 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1936 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1938 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1939 uint8_t full[16*9];\
1940 uint8_t halfH[72];\
1941 uint8_t halfHV[64];\
1942 copy_block9(full, src, 16, stride, 9);\
1943 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1944 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1945 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1946 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1948 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1949 uint8_t halfH[72];\
1950 uint8_t halfHV[64];\
1951 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1952 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1953 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1955 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t halfH[72];\
1957 uint8_t halfHV[64];\
1958 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1959 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1960 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1962 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1963 uint8_t full[16*9];\
1964 uint8_t halfH[72];\
1965 uint8_t halfV[64];\
1966 uint8_t halfHV[64];\
1967 copy_block9(full, src, 16, stride, 9);\
1968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1969 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1971 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1973 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1974 uint8_t full[16*9];\
1975 uint8_t halfH[72];\
1976 copy_block9(full, src, 16, stride, 9);\
1977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1978 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1979 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1981 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1982 uint8_t full[16*9];\
1983 uint8_t halfH[72];\
1984 uint8_t halfV[64];\
1985 uint8_t halfHV[64];\
1986 copy_block9(full, src, 16, stride, 9);\
1987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1988 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1989 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1990 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1992 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1993 uint8_t full[16*9];\
1994 uint8_t halfH[72];\
1995 copy_block9(full, src, 16, stride, 9);\
1996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1997 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1998 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2000 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2001 uint8_t halfH[72];\
2002 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2003 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2005 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2006 OPNAME ## pixels16_c(dst, src, stride, 16);\
2009 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2010 uint8_t half[256];\
2011 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2012 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2015 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2016 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2019 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2020 uint8_t half[256];\
2021 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2022 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2025 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2027 uint8_t half[256];\
2028 copy_block17(full, src, 24, stride, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2030 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2033 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 copy_block17(full, src, 24, stride, 17);\
2036 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2039 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2040 uint8_t full[24*17];\
2041 uint8_t half[256];\
2042 copy_block17(full, src, 24, stride, 17);\
2043 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2044 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2046 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t full[24*17];\
2048 uint8_t halfH[272];\
2049 uint8_t halfV[256];\
2050 uint8_t halfHV[256];\
2051 copy_block17(full, src, 24, stride, 17);\
2052 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2053 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2057 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058 uint8_t full[24*17];\
2059 uint8_t halfH[272];\
2060 uint8_t halfHV[256];\
2061 copy_block17(full, src, 24, stride, 17);\
2062 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2063 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2064 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2065 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2067 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2068 uint8_t full[24*17];\
2069 uint8_t halfH[272];\
2070 uint8_t halfV[256];\
2071 uint8_t halfHV[256];\
2072 copy_block17(full, src, 24, stride, 17);\
2073 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2074 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2076 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2078 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2079 uint8_t full[24*17];\
2080 uint8_t halfH[272];\
2081 uint8_t halfHV[256];\
2082 copy_block17(full, src, 24, stride, 17);\
2083 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2084 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2085 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2086 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2088 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2089 uint8_t full[24*17];\
2090 uint8_t halfH[272];\
2091 uint8_t halfV[256];\
2092 uint8_t halfHV[256];\
2093 copy_block17(full, src, 24, stride, 17);\
2094 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2095 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2096 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2097 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2099 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2100 uint8_t full[24*17];\
2101 uint8_t halfH[272];\
2102 uint8_t halfHV[256];\
2103 copy_block17(full, src, 24, stride, 17);\
2104 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2105 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2106 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2107 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2109 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2110 uint8_t full[24*17];\
2111 uint8_t halfH[272];\
2112 uint8_t halfV[256];\
2113 uint8_t halfHV[256];\
2114 copy_block17(full, src, 24, stride, 17);\
2115 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2116 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2118 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2120 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2121 uint8_t full[24*17];\
2122 uint8_t halfH[272];\
2123 uint8_t halfHV[256];\
2124 copy_block17(full, src, 24, stride, 17);\
2125 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2126 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2127 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2128 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2130 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2131 uint8_t halfH[272];\
2132 uint8_t halfHV[256];\
2133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2134 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2135 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2137 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2138 uint8_t halfH[272];\
2139 uint8_t halfHV[256];\
2140 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2141 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2142 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2144 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2145 uint8_t full[24*17];\
2146 uint8_t halfH[272];\
2147 uint8_t halfV[256];\
2148 uint8_t halfHV[256];\
2149 copy_block17(full, src, 24, stride, 17);\
2150 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2151 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2153 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2155 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2156 uint8_t full[24*17];\
2157 uint8_t halfH[272];\
2158 copy_block17(full, src, 24, stride, 17);\
2159 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2160 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2161 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2163 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2164 uint8_t full[24*17];\
2165 uint8_t halfH[272];\
2166 uint8_t halfV[256];\
2167 uint8_t halfHV[256];\
2168 copy_block17(full, src, 24, stride, 17);\
2169 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2170 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2171 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2172 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2174 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2175 uint8_t full[24*17];\
2176 uint8_t halfH[272];\
2177 copy_block17(full, src, 24, stride, 17);\
2178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2179 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2180 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2182 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2183 uint8_t halfH[272];\
2184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2185 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2188 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2189 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2190 #define op_put(a, b) a = cm[((b) + 16)>>5]
2191 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2193 QPEL_MC(0, put_ , _ , op_put)
2194 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2195 QPEL_MC(0, avg_ , _ , op_avg)
2196 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2197 #undef op_avg
2198 #undef op_avg_no_rnd
2199 #undef op_put
2200 #undef op_put_no_rnd
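/* A note on the ops just #undef'ed: the MPEG-4 qpel tap set
 * (-1, 3, -6, 20, 20, -6, 3, -1) sums to 32, so "(b + 16) >> 5" is a
 * rounding divide by 32 and "(b + 15) >> 5" the no-rounding variant;
 * cm (ff_cropTbl + MAX_NEG_CROP) clamps to 0..255 by table lookup.
 * Quarter-pel positions are then composed by averaging a half-pel
 * filtered plane with the source (pixels*_l2) or with several planes
 * (pixels*_l4), as in the mc10/mc11/... functions above.  A minimal
 * sketch of one filtered sample, assuming the caller has handled the
 * block edges (the example_* name is illustrative, not this file's API): */
static av_unused int example_mpeg4_qpel_sample(const uint8_t *s /* sample left of the half-pel position */)
{
    int b = (s[0] + s[1]) * 20 - (s[-1] + s[2]) * 6
          + (s[-2] + s[3]) * 3 - (s[-3] + s[4]);
    b = (b + 16) >> 5;                    /* taps sum to 32 */
    return b < 0 ? 0 : b > 255 ? 255 : b; /* what cm[] does via lookup */
}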
2202 #if 1
2203 #define H264_LOWPASS(OPNAME, OP, OP2) \
2204 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205 const int h=2;\
2206 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2207 int i;\
2208 for(i=0; i<h; i++)\
2210 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2211 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2212 dst+=dstStride;\
2213 src+=srcStride;\
2217 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2218 const int w=2;\
2219 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2220 int i;\
2221 for(i=0; i<w; i++)\
2223 const int srcB= src[-2*srcStride];\
2224 const int srcA= src[-1*srcStride];\
2225 const int src0= src[0 *srcStride];\
2226 const int src1= src[1 *srcStride];\
2227 const int src2= src[2 *srcStride];\
2228 const int src3= src[3 *srcStride];\
2229 const int src4= src[4 *srcStride];\
2230 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2231 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2232 dst++;\
2233 src++;\
2237 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2238 const int h=2;\
2239 const int w=2;\
2240 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2241 int i;\
2242 src -= 2*srcStride;\
2243 for(i=0; i<h+5; i++)\
2245 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2246 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2247 tmp+=tmpStride;\
2248 src+=srcStride;\
2250 tmp -= tmpStride*(h+5-2);\
2251 for(i=0; i<w; i++)\
2253 const int tmpB= tmp[-2*tmpStride];\
2254 const int tmpA= tmp[-1*tmpStride];\
2255 const int tmp0= tmp[0 *tmpStride];\
2256 const int tmp1= tmp[1 *tmpStride];\
2257 const int tmp2= tmp[2 *tmpStride];\
2258 const int tmp3= tmp[3 *tmpStride];\
2259 const int tmp4= tmp[4 *tmpStride];\
2260 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2261 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2262 dst++;\
2263 tmp++;\
2266 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2267 const int h=4;\
2268 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2269 int i;\
2270 for(i=0; i<h; i++)\
2272 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2273 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2274 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2275 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2276 dst+=dstStride;\
2277 src+=srcStride;\
2281 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282 const int w=4;\
2283 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284 int i;\
2285 for(i=0; i<w; i++)\
2287 const int srcB= src[-2*srcStride];\
2288 const int srcA= src[-1*srcStride];\
2289 const int src0= src[0 *srcStride];\
2290 const int src1= src[1 *srcStride];\
2291 const int src2= src[2 *srcStride];\
2292 const int src3= src[3 *srcStride];\
2293 const int src4= src[4 *srcStride];\
2294 const int src5= src[5 *srcStride];\
2295 const int src6= src[6 *srcStride];\
2296 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2297 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2298 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2299 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2300 dst++;\
2301 src++;\
2305 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2306 const int h=4;\
2307 const int w=4;\
2308 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2309 int i;\
2310 src -= 2*srcStride;\
2311 for(i=0; i<h+5; i++)\
2313 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2314 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2315 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2316 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2317 tmp+=tmpStride;\
2318 src+=srcStride;\
2320 tmp -= tmpStride*(h+5-2);\
2321 for(i=0; i<w; i++)\
2323 const int tmpB= tmp[-2*tmpStride];\
2324 const int tmpA= tmp[-1*tmpStride];\
2325 const int tmp0= tmp[0 *tmpStride];\
2326 const int tmp1= tmp[1 *tmpStride];\
2327 const int tmp2= tmp[2 *tmpStride];\
2328 const int tmp3= tmp[3 *tmpStride];\
2329 const int tmp4= tmp[4 *tmpStride];\
2330 const int tmp5= tmp[5 *tmpStride];\
2331 const int tmp6= tmp[6 *tmpStride];\
2332 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2333 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2334 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2335 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2336 dst++;\
2337 tmp++;\
2341 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2342 const int h=8;\
2343 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2344 int i;\
2345 for(i=0; i<h; i++)\
2347 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2348 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2349 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2350 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2351 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2352 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2353 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2354 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2355 dst+=dstStride;\
2356 src+=srcStride;\
2360 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2361 const int w=8;\
2362 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2363 int i;\
2364 for(i=0; i<w; i++)\
2366 const int srcB= src[-2*srcStride];\
2367 const int srcA= src[-1*srcStride];\
2368 const int src0= src[0 *srcStride];\
2369 const int src1= src[1 *srcStride];\
2370 const int src2= src[2 *srcStride];\
2371 const int src3= src[3 *srcStride];\
2372 const int src4= src[4 *srcStride];\
2373 const int src5= src[5 *srcStride];\
2374 const int src6= src[6 *srcStride];\
2375 const int src7= src[7 *srcStride];\
2376 const int src8= src[8 *srcStride];\
2377 const int src9= src[9 *srcStride];\
2378 const int src10=src[10*srcStride];\
2379 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2380 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2381 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2382 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2383 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2384 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2385 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2386 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2387 dst++;\
2388 src++;\
2392 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2393 const int h=8;\
2394 const int w=8;\
2395 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2396 int i;\
2397 src -= 2*srcStride;\
2398 for(i=0; i<h+5; i++)\
2400 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2401 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2402 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2403 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2404 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2405 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2406 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2407 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2408 tmp+=tmpStride;\
2409 src+=srcStride;\
2411 tmp -= tmpStride*(h+5-2);\
2412 for(i=0; i<w; i++)\
2414 const int tmpB= tmp[-2*tmpStride];\
2415 const int tmpA= tmp[-1*tmpStride];\
2416 const int tmp0= tmp[0 *tmpStride];\
2417 const int tmp1= tmp[1 *tmpStride];\
2418 const int tmp2= tmp[2 *tmpStride];\
2419 const int tmp3= tmp[3 *tmpStride];\
2420 const int tmp4= tmp[4 *tmpStride];\
2421 const int tmp5= tmp[5 *tmpStride];\
2422 const int tmp6= tmp[6 *tmpStride];\
2423 const int tmp7= tmp[7 *tmpStride];\
2424 const int tmp8= tmp[8 *tmpStride];\
2425 const int tmp9= tmp[9 *tmpStride];\
2426 const int tmp10=tmp[10*tmpStride];\
2427 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2428 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2429 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2430 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2431 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2432 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2433 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2434 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2435 dst++;\
2436 tmp++;\
2440 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2441 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2442 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2443 src += 8*srcStride;\
2444 dst += 8*dstStride;\
2445 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2446 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2449 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2450 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2451 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2452 src += 8*srcStride;\
2453 dst += 8*dstStride;\
2454 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2455 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2458 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2459 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2461 src += 8*srcStride;\
2462 dst += 8*dstStride;\
2463 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2464 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2467 #define H264_MC(OPNAME, SIZE) \
2468 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2469 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2472 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2473 uint8_t half[SIZE*SIZE];\
2474 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2475 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2479 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2482 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2483 uint8_t half[SIZE*SIZE];\
2484 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2485 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2488 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2489 uint8_t full[SIZE*(SIZE+5)];\
2490 uint8_t * const full_mid= full + SIZE*2;\
2491 uint8_t half[SIZE*SIZE];\
2492 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2493 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2494 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2497 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2498 uint8_t full[SIZE*(SIZE+5)];\
2499 uint8_t * const full_mid= full + SIZE*2;\
2500 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2501 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2504 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2505 uint8_t full[SIZE*(SIZE+5)];\
2506 uint8_t * const full_mid= full + SIZE*2;\
2507 uint8_t half[SIZE*SIZE];\
2508 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2509 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2510 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2513 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2514 uint8_t full[SIZE*(SIZE+5)];\
2515 uint8_t * const full_mid= full + SIZE*2;\
2516 uint8_t halfH[SIZE*SIZE];\
2517 uint8_t halfV[SIZE*SIZE];\
2518 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2519 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2520 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2521 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2524 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2525 uint8_t full[SIZE*(SIZE+5)];\
2526 uint8_t * const full_mid= full + SIZE*2;\
2527 uint8_t halfH[SIZE*SIZE];\
2528 uint8_t halfV[SIZE*SIZE];\
2529 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2530 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2531 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2532 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2535 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2536 uint8_t full[SIZE*(SIZE+5)];\
2537 uint8_t * const full_mid= full + SIZE*2;\
2538 uint8_t halfH[SIZE*SIZE];\
2539 uint8_t halfV[SIZE*SIZE];\
2540 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2541 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2542 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2543 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2546 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2547 uint8_t full[SIZE*(SIZE+5)];\
2548 uint8_t * const full_mid= full + SIZE*2;\
2549 uint8_t halfH[SIZE*SIZE];\
2550 uint8_t halfV[SIZE*SIZE];\
2551 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2552 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2553 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2554 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2557 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2558 int16_t tmp[SIZE*(SIZE+5)];\
2559 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2562 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2563 int16_t tmp[SIZE*(SIZE+5)];\
2564 uint8_t halfH[SIZE*SIZE];\
2565 uint8_t halfHV[SIZE*SIZE];\
2566 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2567 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2568 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2571 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2572 int16_t tmp[SIZE*(SIZE+5)];\
2573 uint8_t halfH[SIZE*SIZE];\
2574 uint8_t halfHV[SIZE*SIZE];\
2575 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2576 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2577 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2580 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2581 uint8_t full[SIZE*(SIZE+5)];\
2582 uint8_t * const full_mid= full + SIZE*2;\
2583 int16_t tmp[SIZE*(SIZE+5)];\
2584 uint8_t halfV[SIZE*SIZE];\
2585 uint8_t halfHV[SIZE*SIZE];\
2586 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2587 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2588 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2589 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2592 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2593 uint8_t full[SIZE*(SIZE+5)];\
2594 uint8_t * const full_mid= full + SIZE*2;\
2595 int16_t tmp[SIZE*(SIZE+5)];\
2596 uint8_t halfV[SIZE*SIZE];\
2597 uint8_t halfHV[SIZE*SIZE];\
2598 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2599 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2600 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2601 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2604 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2605 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2606 #define op_put(a, b) a = cm[((b) + 16)>>5]
2607 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2608 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2610 H264_LOWPASS(put_ , op_put, op2_put)
2611 H264_LOWPASS(avg_ , op_avg, op2_avg)
2612 H264_MC(put_, 2)
2613 H264_MC(put_, 4)
2614 H264_MC(put_, 8)
2615 H264_MC(put_, 16)
2616 H264_MC(avg_, 4)
2617 H264_MC(avg_, 8)
2618 H264_MC(avg_, 16)
2620 #undef op_avg
2621 #undef op_put
2622 #undef op2_avg
2623 #undef op2_put
2624 #endif
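/* A note on H264_LOWPASS above: the H.264 half-pel taps are
 * (1, -5, 20, 20, -5, 1), summing to 32, so one-dimensional results use
 * "(b + 16) >> 5" (op_put/op_avg).  The hv path stores the horizontal
 * stage unscaled in the int16_t tmp[] buffer, so after the vertical
 * stage the value carries a factor of 32*32 = 1024, hence
 * "(b + 512) >> 10" in op2_put/op2_avg.  A self-contained sketch of one
 * hv sample (example_* names are illustrative, not this file's API): */
static av_unused int example_h264_sixtap(int m2, int m1, int p0,
                                         int p1, int p2, int p3)
{
    return (p0 + p1) * 20 - (m1 + p2) * 5 + (m2 + p3); /* still scaled by 32 */
}

static av_unused int example_h264_hv_sample(const uint8_t *src, int stride)
{
    int tmp[6], i, b;
    /* horizontal pass on the six rows the vertical filter will read */
    for (i = -2; i <= 3; i++) {
        const uint8_t *s = src + i * stride;
        tmp[i + 2] = example_h264_sixtap(s[-2], s[-1], s[0], s[1], s[2], s[3]);
    }
    /* vertical pass over the unscaled intermediates */
    b = example_h264_sixtap(tmp[0], tmp[1], tmp[2], tmp[3], tmp[4], tmp[5]);
    b = (b + 512) >> 10;                  /* 32 * 32 = 1024 */
    return b < 0 ? 0 : b > 255 ? 255 : b;
}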
2626 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2627 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2628 #define H264_WEIGHT(W,H) \
2629 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2630 int y; \
2631 offset <<= log2_denom; \
2632 if(log2_denom) offset += 1<<(log2_denom-1); \
2633 for(y=0; y<H; y++, block += stride){ \
2634 op_scale1(0); \
2635 op_scale1(1); \
2636 if(W==2) continue; \
2637 op_scale1(2); \
2638 op_scale1(3); \
2639 if(W==4) continue; \
2640 op_scale1(4); \
2641 op_scale1(5); \
2642 op_scale1(6); \
2643 op_scale1(7); \
2644 if(W==8) continue; \
2645 op_scale1(8); \
2646 op_scale1(9); \
2647 op_scale1(10); \
2648 op_scale1(11); \
2649 op_scale1(12); \
2650 op_scale1(13); \
2651 op_scale1(14); \
2652 op_scale1(15); \
2655 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2656 int y; \
2657 offset = ((offset + 1) | 1) << log2_denom; \
2658 for(y=0; y<H; y++, dst += stride, src += stride){ \
2659 op_scale2(0); \
2660 op_scale2(1); \
2661 if(W==2) continue; \
2662 op_scale2(2); \
2663 op_scale2(3); \
2664 if(W==4) continue; \
2665 op_scale2(4); \
2666 op_scale2(5); \
2667 op_scale2(6); \
2668 op_scale2(7); \
2669 if(W==8) continue; \
2670 op_scale2(8); \
2671 op_scale2(9); \
2672 op_scale2(10); \
2673 op_scale2(11); \
2674 op_scale2(12); \
2675 op_scale2(13); \
2676 op_scale2(14); \
2677 op_scale2(15); \
2681 H264_WEIGHT(16,16)
2682 H264_WEIGHT(16,8)
2683 H264_WEIGHT(8,16)
2684 H264_WEIGHT(8,8)
2685 H264_WEIGHT(8,4)
2686 H264_WEIGHT(4,8)
2687 H264_WEIGHT(4,4)
2688 H264_WEIGHT(4,2)
2689 H264_WEIGHT(2,4)
2690 H264_WEIGHT(2,2)
2692 #undef op_scale1
2693 #undef op_scale2
2694 #undef H264_WEIGHT
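/* The weight macros just #undef'ed implement H.264 explicit weighted
 * prediction.  For the unidirectional case the effective formula is
 *     out = clip(((in * w + 2^(d-1)) >> d) + o)
 * which the code folds into a single shift by pre-scaling the offset by
 * 2^d and adding the rounding constant up front; the bidirectional
 * variant likewise pre-computes ((offset + 1) | 1) << log2_denom so the
 * rounding term and the averaged offset share one add.  An equivalent
 * unidirectional sketch (the example_* name is illustrative only): */
static av_unused int example_h264_weight_sample(int in, int log2_denom,
                                                int weight, int offset)
{
    int v = in * weight;
    if (log2_denom)
        v += 1 << (log2_denom - 1);       /* round to nearest */
    v = (v >> log2_denom) + offset;       /* arithmetic shift, as above */
    return v < 0 ? 0 : v > 255 ? 255 : v; /* av_clip_uint8 */
}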
2696 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2697 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2698 int i;
2700 for(i=0; i<h; i++){
2701 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2702 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2703 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2704 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2705 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2706 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2707 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2708 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2709 dst+=dstStride;
2710 src+=srcStride;
2711 }
2712 }
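/* The WMV2 ("mspel") half-pel filter above uses the 4-tap kernel
 * (-1, 9, 9, -1); the taps sum to 16, so "+ 8 >> 4" rounds before the
 * cm[] clamp.  A minimal sketch (illustrative name, not this file's API): */
static av_unused int example_wmv2_halfpel(int m1, int p0, int p1, int p2)
{
    int v = (9 * (p0 + p1) - (m1 + p2) + 8) >> 4;
    return v < 0 ? 0 : v > 255 ? 255 : v;
}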
2714 #ifdef CONFIG_CAVS_DECODER
2715 /* AVS specific */
2716 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2718 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2719 put_pixels8_c(dst, src, stride, 8);
2720 }
2721 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2722 avg_pixels8_c(dst, src, stride, 8);
2723 }
2724 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2725 put_pixels16_c(dst, src, stride, 16);
2726 }
2727 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2728 avg_pixels16_c(dst, src, stride, 16);
2729 }
2730 #endif /* CONFIG_CAVS_DECODER */
2732 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2733 /* VC-1 specific */
2734 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2736 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2737 put_pixels8_c(dst, src, stride, 8);
2738 }
2739 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2741 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2743 /* H264 specific */
2744 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2746 #if defined(CONFIG_RV40_DECODER)
2747 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2748 put_pixels16_xy2_c(dst, src, stride, 16);
2749 }
2750 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2751 avg_pixels16_xy2_c(dst, src, stride, 16);
2752 }
2753 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2754 put_pixels8_xy2_c(dst, src, stride, 8);
2755 }
2756 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2757 avg_pixels8_xy2_c(dst, src, stride, 8);
2758 }
2760 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2761 #endif /* CONFIG_RV40_DECODER */
2763 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2764 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2765 int i;
2767 for(i=0; i<w; i++){
2768 const int src_1= src[ -srcStride];
2769 const int src0 = src[0 ];
2770 const int src1 = src[ srcStride];
2771 const int src2 = src[2*srcStride];
2772 const int src3 = src[3*srcStride];
2773 const int src4 = src[4*srcStride];
2774 const int src5 = src[5*srcStride];
2775 const int src6 = src[6*srcStride];
2776 const int src7 = src[7*srcStride];
2777 const int src8 = src[8*srcStride];
2778 const int src9 = src[9*srcStride];
2779 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2780 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2781 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2782 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2783 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2784 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2785 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2786 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2787 src++;
2788 dst++;
2789 }
2790 }
2792 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2793 put_pixels8_c(dst, src, stride, 8);
2794 }
2796 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2797 uint8_t half[64];
2798 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2799 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2800 }
2802 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2803 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2804 }
2806 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2807 uint8_t half[64];
2808 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2809 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2810 }
2812 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2813 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2814 }
2816 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2817 uint8_t halfH[88];
2818 uint8_t halfV[64];
2819 uint8_t halfHV[64];
2820 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2821 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2822 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2823 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2824 }
2825 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2826 uint8_t halfH[88];
2827 uint8_t halfV[64];
2828 uint8_t halfHV[64];
2829 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2830 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2831 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2832 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2833 }
2834 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2835 uint8_t halfH[88];
2836 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2837 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2838 }
2840 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2841 if(ENABLE_ANY_H263) {
2842 int x;
2843 const int strength= ff_h263_loop_filter_strength[qscale];
2845 for(x=0; x<8; x++){
2846 int d1, d2, ad1;
2847 int p0= src[x-2*stride];
2848 int p1= src[x-1*stride];
2849 int p2= src[x+0*stride];
2850 int p3= src[x+1*stride];
2851 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2853 if (d<-2*strength) d1= 0;
2854 else if(d<- strength) d1=-2*strength - d;
2855 else if(d< strength) d1= d;
2856 else if(d< 2*strength) d1= 2*strength - d;
2857 else d1= 0;
2859 p1 += d1;
2860 p2 -= d1;
2861 if(p1&256) p1= ~(p1>>31);
2862 if(p2&256) p2= ~(p2>>31);
2864 src[x-1*stride] = p1;
2865 src[x+0*stride] = p2;
2867 ad1= FFABS(d1)>>1;
2869 d2= av_clip((p0-p3)/4, -ad1, ad1);
2871 src[x-2*stride] = p0 - d2;
2872 src[x+ stride] = p3 + d2;
2873 }
2874 }
2875 }
2877 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2878 if(ENABLE_ANY_H263) {
2879 int y;
2880 const int strength= ff_h263_loop_filter_strength[qscale];
2882 for(y=0; y<8; y++){
2883 int d1, d2, ad1;
2884 int p0= src[y*stride-2];
2885 int p1= src[y*stride-1];
2886 int p2= src[y*stride+0];
2887 int p3= src[y*stride+1];
2888 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2890 if (d<-2*strength) d1= 0;
2891 else if(d<- strength) d1=-2*strength - d;
2892 else if(d< strength) d1= d;
2893 else if(d< 2*strength) d1= 2*strength - d;
2894 else d1= 0;
2896 p1 += d1;
2897 p2 -= d1;
2898 if(p1&256) p1= ~(p1>>31);
2899 if(p2&256) p2= ~(p2>>31);
2901 src[y*stride-1] = p1;
2902 src[y*stride+0] = p2;
2904 ad1= FFABS(d1)>>1;
2906 d2= av_clip((p0-p3)/4, -ad1, ad1);
2908 src[y*stride-2] = p0 - d2;
2909 src[y*stride+1] = p3 + d2;
2910 }
2911 }
2912 }
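/* Both H.263 loop filters above compute the raw edge step
 * d = (p0 - p3 + 4*(p2 - p1)) / 8 and push it through a symmetric ramp:
 * the correction is d itself while |d| < strength, fades linearly to
 * zero by |d| = 2*strength, and is zero beyond that, so genuine image
 * edges are left untouched.  The same ramp, as an illustrative sketch: */
static av_unused int example_h263_filter_ramp(int d, int strength)
{
    if (d < -2 * strength) return 0;
    if (d <     -strength) return -2 * strength - d;
    if (d <      strength) return d;
    if (d <  2 * strength) return 2 * strength - d;
    return 0;
}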
2914 static void h261_loop_filter_c(uint8_t *src, int stride){
2915 int x,y,xy,yz;
2916 int temp[64];
2918 for(x=0; x<8; x++){
2919 temp[x ] = 4*src[x ];
2920 temp[x + 7*8] = 4*src[x + 7*stride];
2921 }
2922 for(y=1; y<7; y++){
2923 for(x=0; x<8; x++){
2924 xy = y * stride + x;
2925 yz = y * 8 + x;
2926 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2927 }
2928 }
2930 for(y=0; y<8; y++){
2931 src[ y*stride] = (temp[ y*8] + 2)>>2;
2932 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2933 for(x=1; x<7; x++){
2934 xy = y * stride + x;
2935 yz = y * 8 + x;
2936 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2937 }
2938 }
2939 }
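/* h261_loop_filter_c above is a separable (1,2,1)/4 smoother.  The first
 * pass fills temp[] with the vertical (1,2,1) response (outer rows copied
 * with the same 4x gain); the second pass applies (1,2,1) horizontally,
 * so interior samples carry a 16x gain ("+ 8 >> 4") while border columns
 * carry only the vertical 4x gain ("+ 2 >> 2").  One interior output
 * sample, as an illustrative sketch: */
static av_unused int example_h261_smooth(const uint8_t *src, int stride)
{
    int v =     src[-stride - 1] + 2 * src[-stride] + src[-stride + 1]
          + 2 * (src[-1]         + 2 * src[0]       + src[1])
          +     src[ stride - 1] + 2 * src[ stride] + src[ stride + 1];
    return (v + 8) >> 4; /* 3x3 binomial weights sum to 16 */
}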
2941 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2942 {
2943 int i, d;
2944 for( i = 0; i < 4; i++ ) {
2945 if( tc0[i] < 0 ) {
2946 pix += 4*ystride;
2947 continue;
2948 }
2949 for( d = 0; d < 4; d++ ) {
2950 const int p0 = pix[-1*xstride];
2951 const int p1 = pix[-2*xstride];
2952 const int p2 = pix[-3*xstride];
2953 const int q0 = pix[0];
2954 const int q1 = pix[1*xstride];
2955 const int q2 = pix[2*xstride];
2957 if( FFABS( p0 - q0 ) < alpha &&
2958 FFABS( p1 - p0 ) < beta &&
2959 FFABS( q1 - q0 ) < beta ) {
2961 int tc = tc0[i];
2962 int i_delta;
2964 if( FFABS( p2 - p0 ) < beta ) {
2965 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2966 tc++;
2967 }
2968 if( FFABS( q2 - q0 ) < beta ) {
2969 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2970 tc++;
2971 }
2973 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2974 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2975 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2976 }
2977 pix += ystride;
2978 }
2979 }
2980 }
2981 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2982 {
2983 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2984 }
2985 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2986 {
2987 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2988 }
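/* The core of the normal-strength (bS < 4) H.264 deblocking, shared by
 * the luma filter above and the chroma filter below, is
 *     delta = clip(((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc)
 * added to p0 and subtracted from q0; for luma, tc starts at tc0[i] and
 * grows by one for each p2/q2 side correction that also fired.  A sketch
 * of one filtered pixel pair (illustrative name, not this file's API): */
static av_unused void example_h264_deblock_pair(uint8_t *p0, uint8_t *q0,
                                                int p1, int q1, int tc)
{
    int np, nq;
    int delta = (((*q0 - *p0) << 2) + (p1 - q1) + 4) >> 3;
    if (delta < -tc) delta = -tc;
    if (delta >  tc) delta =  tc;
    np = *p0 + delta;
    nq = *q0 - delta;
    *p0 = np < 0 ? 0 : np > 255 ? 255 : np;
    *q0 = nq < 0 ? 0 : nq > 255 ? 255 : nq;
}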
2990 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2991 {
2992 int i, d;
2993 for( i = 0; i < 4; i++ ) {
2994 const int tc = tc0[i];
2995 if( tc <= 0 ) {
2996 pix += 2*ystride;
2997 continue;
2998 }
2999 for( d = 0; d < 2; d++ ) {
3000 const int p0 = pix[-1*xstride];
3001 const int p1 = pix[-2*xstride];
3002 const int q0 = pix[0];
3003 const int q1 = pix[1*xstride];
3005 if( FFABS( p0 - q0 ) < alpha &&
3006 FFABS( p1 - p0 ) < beta &&
3007 FFABS( q1 - q0 ) < beta ) {
3009 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3011 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3012 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
3013 }
3014 pix += ystride;
3015 }
3016 }
3017 }
3018 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3019 {
3020 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3021 }
3022 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3023 {
3024 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3025 }
3027 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3028 {
3029 int d;
3030 for( d = 0; d < 8; d++ ) {
3031 const int p0 = pix[-1*xstride];
3032 const int p1 = pix[-2*xstride];
3033 const int q0 = pix[0];
3034 const int q1 = pix[1*xstride];
3036 if( FFABS( p0 - q0 ) < alpha &&
3037 FFABS( p1 - p0 ) < beta &&
3038 FFABS( q1 - q0 ) < beta ) {
3040 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3041 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3042 }
3043 pix += ystride;
3044 }
3045 }
3046 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3047 {
3048 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3049 }
3050 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3051 {
3052 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3053 }
3055 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3056 {
3057 int s, i;
3059 s = 0;
3060 for(i=0;i<h;i++) {
3061 s += abs(pix1[0] - pix2[0]);
3062 s += abs(pix1[1] - pix2[1]);
3063 s += abs(pix1[2] - pix2[2]);
3064 s += abs(pix1[3] - pix2[3]);
3065 s += abs(pix1[4] - pix2[4]);
3066 s += abs(pix1[5] - pix2[5]);
3067 s += abs(pix1[6] - pix2[6]);
3068 s += abs(pix1[7] - pix2[7]);
3069 s += abs(pix1[8] - pix2[8]);
3070 s += abs(pix1[9] - pix2[9]);
3071 s += abs(pix1[10] - pix2[10]);
3072 s += abs(pix1[11] - pix2[11]);
3073 s += abs(pix1[12] - pix2[12]);
3074 s += abs(pix1[13] - pix2[13]);
3075 s += abs(pix1[14] - pix2[14]);
3076 s += abs(pix1[15] - pix2[15]);
3077 pix1 += line_size;
3078 pix2 += line_size;
3079 }
3080 return s;
3081 }
3083 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3084 {
3085 int s, i;
3087 s = 0;
3088 for(i=0;i<h;i++) {
3089 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3090 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3091 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3092 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3093 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3094 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3095 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3096 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3097 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3098 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3099 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3100 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3101 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3102 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3103 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3104 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3105 pix1 += line_size;
3106 pix2 += line_size;
3107 }
3108 return s;
3109 }
3111 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3112 {
3113 int s, i;
3114 uint8_t *pix3 = pix2 + line_size;
3116 s = 0;
3117 for(i=0;i<h;i++) {
3118 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3119 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3120 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3121 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3122 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3123 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3124 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3125 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3126 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3127 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3128 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3129 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3130 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3131 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3132 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3133 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3134 pix1 += line_size;
3135 pix2 += line_size;
3136 pix3 += line_size;
3137 }
3138 return s;
3139 }
3141 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3142 {
3143 int s, i;
3144 uint8_t *pix3 = pix2 + line_size;
3146 s = 0;
3147 for(i=0;i<h;i++) {
3148 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3149 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3150 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3151 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3152 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3153 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3154 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3155 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3156 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3157 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3158 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3159 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3160 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3161 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3162 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3163 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3164 pix1 += line_size;
3165 pix2 += line_size;
3166 pix3 += line_size;
3167 }
3168 return s;
3169 }
3171 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3172 {
3173 int s, i;
3175 s = 0;
3176 for(i=0;i<h;i++) {
3177 s += abs(pix1[0] - pix2[0]);
3178 s += abs(pix1[1] - pix2[1]);
3179 s += abs(pix1[2] - pix2[2]);
3180 s += abs(pix1[3] - pix2[3]);
3181 s += abs(pix1[4] - pix2[4]);
3182 s += abs(pix1[5] - pix2[5]);
3183 s += abs(pix1[6] - pix2[6]);
3184 s += abs(pix1[7] - pix2[7]);
3185 pix1 += line_size;
3186 pix2 += line_size;
3188 return s;
3191 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3193 int s, i;
3195 s = 0;
3196 for(i=0;i<h;i++) {
3197 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3198 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3199 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3200 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3201 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3202 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3203 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3204 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3205 pix1 += line_size;
3206 pix2 += line_size;
3208 return s;
3211 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3213 int s, i;
3214 uint8_t *pix3 = pix2 + line_size;
3216 s = 0;
3217 for(i=0;i<h;i++) {
3218 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3219 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3220 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3221 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3222 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3223 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3224 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3225 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3226 pix1 += line_size;
3227 pix2 += line_size;
3228 pix3 += line_size;
3230 return s;
3233 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3235 int s, i;
3236 uint8_t *pix3 = pix2 + line_size;
3238 s = 0;
3239 for(i=0;i<h;i++) {
3240 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3241 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3242 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3243 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3244 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3245 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3246 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3247 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3248 pix1 += line_size;
3249 pix2 += line_size;
3250 pix3 += line_size;
3252 return s;
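/* The _x2, _y2 and _xy2 variants above compute the SAD against a half-pel
   interpolated reference: avg2() and avg4() (defined earlier in this file)
   are the usual rounding averages, roughly

       avg2(a,b)     = (a + b + 1) >> 1
       avg4(a,b,c,d) = (a + b + c + d + 2) >> 2

   so pix_abs8_xy2_c, for instance, scores pix1 against the reference
   shifted half a pixel in both x and y (a bilinear 2x2 average). */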
3255 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3256 MpegEncContext *c = v;
3257 int score1=0;
3258 int score2=0;
3259 int x,y;
3261 for(y=0; y<h; y++){
3262 for(x=0; x<16; x++){
3263 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3265 if(y+1<h){
3266 for(x=0; x<15; x++){
3267 score2+= FFABS( s1[x ] - s1[x +stride]
3268 - s1[x+1] + s1[x+1+stride])
3269 -FFABS( s2[x ] - s2[x +stride]
3270 - s2[x+1] + s2[x+1+stride]);
3273 s1+= stride;
3274 s2+= stride;
3277 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3278 else return score1 + FFABS(score2)*8;
3281 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3282 MpegEncContext *c = v;
3283 int score1=0;
3284 int score2=0;
3285 int x,y;
3287 for(y=0; y<h; y++){
3288 for(x=0; x<8; x++){
3289 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3291 if(y+1<h){
3292 for(x=0; x<7; x++){
3293 score2+= FFABS( s1[x ] - s1[x +stride]
3294 - s1[x+1] + s1[x+1+stride])
3295 -FFABS( s2[x ] - s2[x +stride]
3296 - s2[x+1] + s2[x+1+stride]);
3299 s1+= stride;
3300 s2+= stride;
3303 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3304 else return score1 + FFABS(score2)*8;
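/* NSSE ("noise preserving SSE") is plain SSE (score1) plus a penalty
   (score2) measuring how much the second-order horizontal/vertical texture
   differs between the two blocks. Blurring away film grain raises score2
   even when the SSE stays low, so FF_CMP_NSSE steers encoders towards
   modes that keep the noise. The weight comes from avctx->nsse_weight,
   falling back to 8 when no context is available. */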
3307 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3308 int i;
3309 unsigned int sum=0;
3311 for(i=0; i<8*8; i++){
3312 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3313 int w= weight[i];
3314 b>>= RECON_SHIFT;
3315 assert(-512<b && b<512);
3317 sum += (w*b)*(w*b)>>4;
3319 return sum>>2;
3322 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3323 int i;
3325 for(i=0; i<8*8; i++){
3326 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
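/* try_8x8basis()/add_8x8basis() support the encoder's quantizer noise
   shaping search: a scaled basis function is tentatively added to the
   residual and the weighted squared error is measured. The expression
   (basis[i]*scale + (1<<(BASIS_SHIFT-RECON_SHIFT-1))) >> (BASIS_SHIFT-RECON_SHIFT)
   is a round-to-nearest fixed-point rescale, the generic pattern being
   (x + (1 << (n-1))) >> n. */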
3331  * permutes an 8x8 block.
3332  * @param block the block which will be permuted according to the given permutation vector
3333  * @param permutation the permutation vector
3334  * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3335  * @param scantable the scantable in use; it is only used to speed the permutation up, the block is not
3336  * (inverse) permuted to scantable order!
3338 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3340 int i;
3341 DCTELEM temp[64];
3343 if(last<=0) return;
3344 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3346 for(i=0; i<=last; i++){
3347 const int j= scantable[i];
3348 temp[j]= block[j];
3349 block[j]=0;
3352 for(i=0; i<=last; i++){
3353 const int j= scantable[i];
3354 const int perm_j= permutation[j];
3355 block[perm_j]= temp[j];
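/* Usage sketch: with last == 63 ff_block_permute() moves every coefficient
   j to permutation[j], i.e. afterwards block[permutation[j]] holds the old
   block[j]. The scantable/last pair merely restricts the copy loops to
   coefficients that can be non-zero, it does not change the result. */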
3359 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3360 return 0;
3363 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3364 int i;
3366 memset(cmp, 0, sizeof(void*)*5);
3368 for(i=0; i<5; i++){
3369 switch(type&0xFF){
3370 case FF_CMP_SAD:
3371 cmp[i]= c->sad[i];
3372 break;
3373 case FF_CMP_SATD:
3374 cmp[i]= c->hadamard8_diff[i];
3375 break;
3376 case FF_CMP_SSE:
3377 cmp[i]= c->sse[i];
3378 break;
3379 case FF_CMP_DCT:
3380 cmp[i]= c->dct_sad[i];
3381 break;
3382 case FF_CMP_DCT264:
3383 cmp[i]= c->dct264_sad[i];
3384 break;
3385 case FF_CMP_DCTMAX:
3386 cmp[i]= c->dct_max[i];
3387 break;
3388 case FF_CMP_PSNR:
3389 cmp[i]= c->quant_psnr[i];
3390 break;
3391 case FF_CMP_BIT:
3392 cmp[i]= c->bit[i];
3393 break;
3394 case FF_CMP_RD:
3395 cmp[i]= c->rd[i];
3396 break;
3397 case FF_CMP_VSAD:
3398 cmp[i]= c->vsad[i];
3399 break;
3400 case FF_CMP_VSSE:
3401 cmp[i]= c->vsse[i];
3402 break;
3403 case FF_CMP_ZERO:
3404 cmp[i]= zero_cmp;
3405 break;
3406 case FF_CMP_NSSE:
3407 cmp[i]= c->nsse[i];
3408 break;
3409 #ifdef CONFIG_SNOW_ENCODER
3410 case FF_CMP_W53:
3411 cmp[i]= c->w53[i];
3412 break;
3413 case FF_CMP_W97:
3414 cmp[i]= c->w97[i];
3415 break;
3416 #endif
3417 default:
3418 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
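/* Usage sketch (illustrative, assuming an initialized DSPContext dsp):

       me_cmp_func cmp[5];
       ff_set_cmp(&dsp, cmp, FF_CMP_SAD);
       int score = cmp[0](NULL, cur, ref, stride, 16); // 16x16 SAD

   cmp[0] is the 16x16 variant and cmp[1] the 8x8 one; the remaining slots
   are only filled for comparison types that provide them. */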
3423 static void clear_block_c(DCTELEM *block)
3425 memset(block, 0, sizeof(DCTELEM)*64);
3429 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3431 static void clear_blocks_c(DCTELEM *blocks)
3433 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3436 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3437 long i;
3438 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3439 long a = *(long*)(src+i);
3440 long b = *(long*)(dst+i);
3441 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3443 for(; i<w; i++)
3444 dst[i+0] += src[i+0];
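/* The long-word loop above is a SWAR trick: it adds sizeof(long) bytes in
   one operation without carries leaking between bytes. The low 7 bits of
   each byte are added with the top bits masked off (pb_7f), then the
   correct top bit per byte is restored via ((a^b) & pb_80). One-byte
   check: a=0x90, b=0xA0 gives (0x10+0x20) ^ ((0x90^0xA0)&0x80) = 0x30,
   which equals (0x90+0xA0) & 0xFF; each byte wraps modulo 256 instead of
   carrying into its neighbour. */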
3447 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3448 long i;
3449 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3450 long a = *(long*)(src1+i);
3451 long b = *(long*)(src2+i);
3452 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3454 for(; i<w; i++)
3455 dst[i] = src1[i]+src2[i];
3458 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3459 long i;
3460 #ifndef HAVE_FAST_UNALIGNED
3461 if((long)src2 & (sizeof(long)-1)){
3462 for(i=0; i+7<w; i+=8){
3463 dst[i+0] = src1[i+0]-src2[i+0];
3464 dst[i+1] = src1[i+1]-src2[i+1];
3465 dst[i+2] = src1[i+2]-src2[i+2];
3466 dst[i+3] = src1[i+3]-src2[i+3];
3467 dst[i+4] = src1[i+4]-src2[i+4];
3468 dst[i+5] = src1[i+5]-src2[i+5];
3469 dst[i+6] = src1[i+6]-src2[i+6];
3470 dst[i+7] = src1[i+7]-src2[i+7];
3472 }else
3473 #endif
3474 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3475 long a = *(long*)(src1+i);
3476 long b = *(long*)(src2+i);
3477 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3479 for(; i<w; i++)
3480 dst[i+0] = src1[i+0]-src2[i+0];
3483 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3484 int i;
3485 uint8_t l, lt;
3487 l= *left;
3488 lt= *left_top;
3490 for(i=0; i<w; i++){
3491 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3492 lt= src1[i];
3493 l= src2[i];
3494 dst[i]= l - pred;
3497 *left= l;
3498 *left_top= lt;
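/* This is the HuffYUV median predictor: each pixel is predicted as
   mid_pred(left, top, left + top - topleft) and the residual is the pixel
   minus that prediction. src1 is the previous (top) line, src2 the current
   one; *left and *left_top carry state across calls. Example: left=10,
   top=14, topleft=12 gives candidates 10, 14, 12 and a prediction of 12. */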
3501 #define BUTTERFLY2(o1,o2,i1,i2) \
3502 o1= (i1)+(i2);\
3503 o2= (i1)-(i2);
3505 #define BUTTERFLY1(x,y) \
3507 int a,b;\
3508 a= x;\
3509 b= y;\
3510 x= a+b;\
3511 y= a-b;\
3514 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3516 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3517 int i;
3518 int temp[64];
3519 int sum=0;
3521 assert(h==8);
3523 for(i=0; i<8; i++){
3524 //FIXME try pointer walks
3525 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3526 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3527 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3528 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3530 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3531 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3532 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3533 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3535 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3536 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3537 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3538 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3541 for(i=0; i<8; i++){
3542 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3543 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3544 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3545 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3547 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3548 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3549 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3550 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3552 sum +=
3553 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3554 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3555 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3556 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3558 #if 0
3559 static int maxi=0;
3560 if(sum>maxi){
3561 maxi=sum;
3562 printf("MAX:%d\n", maxi);
3564 #endif
3565 return sum;
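/* hadamard8_diff8x8_c is the classic SATD: an 8x8 Hadamard (butterfly-only)
   transform of the source/reconstruction difference followed by the sum of
   absolute transform coefficients. BUTTERFLYA(x,y) = |x+y| + |x-y| fuses
   the last butterfly stage with the absolute-value accumulation. SATD
   tracks coding cost much better than SAD at comparable speed, which is
   why FF_CMP_SATD maps to hadamard8_diff above. */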
3568 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3569 int i;
3570 int temp[64];
3571 int sum=0;
3573 assert(h==8);
3575 for(i=0; i<8; i++){
3576 //FIXME try pointer walks
3577 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3578 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3579 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3580 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3582 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3583 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3584 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3585 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3587 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3588 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3589 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3590 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3593 for(i=0; i<8; i++){
3594 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3595 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3596 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3597 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3599 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3600 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3601 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3602 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3604 sum +=
3605 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3606 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3607 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3608 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3611 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3613 return sum;
3616 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3617 MpegEncContext * const s= (MpegEncContext *)c;
3618 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3619 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3621 assert(h==8);
3623 s->dsp.diff_pixels(temp, src1, src2, stride);
3624 s->dsp.fdct(temp);
3625 return s->dsp.sum_abs_dctelem(temp);
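/* DCT-SAD (FF_CMP_DCT): the difference block is run through the encoder's
   forward DCT and the absolute coefficients are summed, so the score
   approximates the energy the quantizer will actually see. */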
3628 #ifdef CONFIG_GPL
3629 #define DCT8_1D {\
3630 const int s07 = SRC(0) + SRC(7);\
3631 const int s16 = SRC(1) + SRC(6);\
3632 const int s25 = SRC(2) + SRC(5);\
3633 const int s34 = SRC(3) + SRC(4);\
3634 const int a0 = s07 + s34;\
3635 const int a1 = s16 + s25;\
3636 const int a2 = s07 - s34;\
3637 const int a3 = s16 - s25;\
3638 const int d07 = SRC(0) - SRC(7);\
3639 const int d16 = SRC(1) - SRC(6);\
3640 const int d25 = SRC(2) - SRC(5);\
3641 const int d34 = SRC(3) - SRC(4);\
3642 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3643 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3644 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3645 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3646 DST(0, a0 + a1 ) ;\
3647 DST(1, a4 + (a7>>2)) ;\
3648 DST(2, a2 + (a3>>1)) ;\
3649 DST(3, a5 + (a6>>2)) ;\
3650 DST(4, a0 - a1 ) ;\
3651 DST(5, a6 - (a5>>2)) ;\
3652 DST(6, (a2>>1) - a3 ) ;\
3653 DST(7, (a4>>2) - a7 ) ;\
3656 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3657 MpegEncContext * const s= (MpegEncContext *)c;
3658 DCTELEM dct[8][8];
3659 int i;
3660 int sum=0;
3662 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3664 #define SRC(x) dct[i][x]
3665 #define DST(x,v) dct[i][x]= v
3666 for( i = 0; i < 8; i++ )
3667 DCT8_1D
3668 #undef SRC
3669 #undef DST
3671 #define SRC(x) dct[x][i]
3672 #define DST(x,v) sum += FFABS(v)
3673 for( i = 0; i < 8; i++ )
3674 DCT8_1D
3675 #undef SRC
3676 #undef DST
3677 return sum;
3679 #endif
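/* DCT8_1D above is an integer 8x8 transform in the style of the H.264
   high-profile one (a shift-and-add DCT approximation), so FF_CMP_DCT264
   scores the residual roughly as an H.264 encoder would. The CONFIG_GPL
   guard marks this block as GPL rather than LGPL licensed. */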
3681 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3682 MpegEncContext * const s= (MpegEncContext *)c;
3683 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3684 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3685 int sum=0, i;
3687 assert(h==8);
3689 s->dsp.diff_pixels(temp, src1, src2, stride);
3690 s->dsp.fdct(temp);
3692 for(i=0; i<64; i++)
3693 sum= FFMAX(sum, FFABS(temp[i]));
3695 return sum;
3698 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3699 MpegEncContext * const s= (MpegEncContext *)c;
3700 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3701 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3702 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3703 int sum=0, i;
3705 assert(h==8);
3706 s->mb_intra=0;
3708 s->dsp.diff_pixels(temp, src1, src2, stride);
3710 memcpy(bak, temp, 64*sizeof(DCTELEM));
3712 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3713 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3714 ff_simple_idct(temp); //FIXME
3716 for(i=0; i<64; i++)
3717 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3719 return sum;
3722 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3723 MpegEncContext * const s= (MpegEncContext *)c;
3724 const uint8_t *scantable= s->intra_scantable.permutated;
3725 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3726 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3727 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3728 uint8_t * const bak= (uint8_t*)aligned_bak;
3729 int i, last, run, bits, level, distortion, start_i;
3730 const int esc_length= s->ac_esc_length;
3731 uint8_t * length;
3732 uint8_t * last_length;
3734 assert(h==8);
3736 for(i=0; i<8; i++){
3737 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3738 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3741 s->dsp.diff_pixels(temp, src1, src2, stride);
3743 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3745 bits=0;
3747 if (s->mb_intra) {
3748 start_i = 1;
3749 length = s->intra_ac_vlc_length;
3750 last_length= s->intra_ac_vlc_last_length;
3751 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3752 } else {
3753 start_i = 0;
3754 length = s->inter_ac_vlc_length;
3755 last_length= s->inter_ac_vlc_last_length;
3758 if(last>=start_i){
3759 run=0;
3760 for(i=start_i; i<last; i++){
3761 int j= scantable[i];
3762 level= temp[j];
3764 if(level){
3765 level+=64;
3766 if((level&(~127)) == 0){
3767 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3768 }else
3769 bits+= esc_length;
3770 run=0;
3771 }else
3772 run++;
3774 i= scantable[last];
3776 level= temp[i] + 64;
3778 assert(level - 64);
3780 if((level&(~127)) == 0){
3781 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3782 }else
3783 bits+= esc_length;
3787 if(last>=0){
3788 if(s->mb_intra)
3789 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3790 else
3791 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3794 s->dsp.idct_add(bak, stride, temp);
3796 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3798 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
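/* rd8x8_c returns a true rate-distortion cost: the residual is quantized,
   dequantized and inverse transformed, the SSE against the source is the
   distortion, and the estimated VLC bits are weighted by
   lambda = qscale^2 * 109/128 (the (bits*qscale*qscale*109 + 64) >> 7
   term); with qscale == 4 each bit costs about 13.6 distortion units. */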
3801 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3802 MpegEncContext * const s= (MpegEncContext *)c;
3803 const uint8_t *scantable= s->intra_scantable.permutated;
3804 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3805 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3806 int i, last, run, bits, level, start_i;
3807 const int esc_length= s->ac_esc_length;
3808 uint8_t * length;
3809 uint8_t * last_length;
3811 assert(h==8);
3813 s->dsp.diff_pixels(temp, src1, src2, stride);
3815 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3817 bits=0;
3819 if (s->mb_intra) {
3820 start_i = 1;
3821 length = s->intra_ac_vlc_length;
3822 last_length= s->intra_ac_vlc_last_length;
3823 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3824 } else {
3825 start_i = 0;
3826 length = s->inter_ac_vlc_length;
3827 last_length= s->inter_ac_vlc_last_length;
3830 if(last>=start_i){
3831 run=0;
3832 for(i=start_i; i<last; i++){
3833 int j= scantable[i];
3834 level= temp[j];
3836 if(level){
3837 level+=64;
3838 if((level&(~127)) == 0){
3839 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3840 }else
3841 bits+= esc_length;
3842 run=0;
3843 }else
3844 run++;
3846 i= scantable[last];
3848 level= temp[i] + 64;
3850 assert(level - 64);
3852 if((level&(~127)) == 0){
3853 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3854 }else
3855 bits+= esc_length;
3858 return bits;
3861 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3862 int score=0;
3863 int x,y;
3865 for(y=1; y<h; y++){
3866 for(x=0; x<16; x+=4){
3867 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3868 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
3870 s+= stride;
3873 return score;
3876 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3877 int score=0;
3878 int x,y;
3880 for(y=1; y<h; y++){
3881 for(x=0; x<16; x++){
3882 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3884 s1+= stride;
3885 s2+= stride;
3888 return score;
3891 #define SQ(a) ((a)*(a))
3892 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3893 int score=0;
3894 int x,y;
3896 for(y=1; y<h; y++){
3897 for(x=0; x<16; x+=4){
3898 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3899 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3901 s+= stride;
3904 return score;
3907 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3908 int score=0;
3909 int x,y;
3911 for(y=1; y<h; y++){
3912 for(x=0; x<16; x++){
3913 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3915 s1+= stride;
3916 s2+= stride;
3919 return score;
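/* The vsad/vsse metrics sum absolute (respectively squared) differences of
   vertical gradients; for the inter versions the per-pixel term is
   (s1[x]-s2[x]) - (s1[x+stride]-s2[x+stride]), i.e. how the error changes
   from one line to the next. They therefore react to vertical structure
   (e.g. interlacing artifacts) that plain SAD/SSE averages away. */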
3922 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3923 int size){
3924 int score=0;
3925 int i;
3926 for(i=0; i<size; i++)
3927 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3928 return score;
3931 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3932 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3933 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3934 #ifdef CONFIG_GPL
3935 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3936 #endif
3937 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3938 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3939 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3940 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
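/* WRAPPER8_16_SQ (defined earlier in this file) builds each 16x16
   comparison from its 8x8 kernel by scoring the four 8x8 quadrants and
   summing them, so every metric above only needs an 8x8 implementation. */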
3942 static void vector_fmul_c(float *dst, const float *src, int len){
3943 int i;
3944 for(i=0; i<len; i++)
3945 dst[i] *= src[i];
3948 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3949 int i;
3950 src1 += len-1;
3951 for(i=0; i<len; i++)
3952 dst[i] = src0[i] * src1[-i];
3955 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3956 int i;
3957 for(i=0; i<len; i++)
3958 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
3961 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3962 int i,j;
3963 dst += len;
3964 win += len;
3965 src0+= len;
3966 for(i=-len, j=len-1; i<0; i++, j--) {
3967 float s0 = src0[i];
3968 float s1 = src1[j];
3969 float wi = win[i];
3970 float wj = win[j];
3971 dst[i] = s0*wj - s1*wi + add_bias;
3972 dst[j] = s0*wi + s1*wj + add_bias;
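/* ff_vector_fmul_window_c is the windowed overlap-add of MDCT based audio
   codecs: for i in [-len,0) and the mirrored index j it emits
   dst[i] = s0*win[j] - s1*win[i] and dst[j] = s0*win[i] + s1*win[j],
   applying the window to both halves and cross-fading them in one pass.
   add_bias lets callers pre-bias samples for the float_to_int16 trick
   below. */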
3976 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3977 int i;
3978 for(i=0; i<len; i++)
3979 dst[i] = src[i] * mul;
3982 static av_always_inline int float_to_int16_one(const float *src){
3983 int_fast32_t tmp = *(const int32_t*)src;
3984 if(tmp & 0xf0000){
3985 tmp = (0x43c0ffff - tmp)>>31;
3986 // is this faster on some gcc/cpu combinations?
3987 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3988 // else tmp = 0;
3990 return tmp - 0x8000;
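/* Sketch of the bit trick above: the caller is expected to have biased each
   sample by about +385.0, which places the 16-bit value plus 0x8000 in the
   low mantissa bits of the IEEE-754 representation. Reinterpreting the
   float as an int32 and subtracting 0x8000 then leaves the sample in the
   low 16 bits (the store into int16_t discards the rest). Example: 385.0f
   has the bit pattern 0x43C08000, so a silent sample comes out as
   0x8000 - 0x8000 = 0. The (tmp & 0xf0000) branch clamps out-of-range
   input to -32768/32767. */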
3993 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3994 int i;
3995 for(i=0; i<len; i++)
3996 dst[i] = float_to_int16_one(src+i);
3999 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4000 int i,j,c;
4001 if(channels==2){
4002 for(i=0; i<len; i++){
4003 dst[2*i] = float_to_int16_one(src[0]+i);
4004 dst[2*i+1] = float_to_int16_one(src[1]+i);
4006 }else{
4007 for(c=0; c<channels; c++)
4008 for(i=0, j=c; i<len; i++, j+=channels)
4009 dst[j] = float_to_int16_one(src[c]+i);
4013 static void add_int16_c(int16_t * v1, int16_t * v2, int order)
4015 while (order--)
4016 *v1++ += *v2++;
4019 static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
4021 while (order--)
4022 *v1++ -= *v2++;
4025 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4027 int res = 0;
4029 while (order--)
4030 res += (*v1++ * *v2++) >> shift;
4032 return res;
4035 #define W0 2048
4036 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4037 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4038 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4039 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4040 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4041 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4042 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
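/* The W constants are 2048*sqrt(2)*cos(k*pi/16) in 11-bit fixed point, e.g.
   W1 = 2048 * 1.414214 * 0.980785 which rounds to 2841. The rows below are
   renormalized with (x + (1<<7)) >> 8 and the columns with
   (x + (1<<13)) >> 14, i.e. round-to-nearest at each stage. */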
4044 static void wmv2_idct_row(short * b)
4046 int s1,s2;
4047 int a0,a1,a2,a3,a4,a5,a6,a7;
4048 /*step 1*/
4049 a1 = W1*b[1]+W7*b[7];
4050 a7 = W7*b[1]-W1*b[7];
4051 a5 = W5*b[5]+W3*b[3];
4052 a3 = W3*b[5]-W5*b[3];
4053 a2 = W2*b[2]+W6*b[6];
4054 a6 = W6*b[2]-W2*b[6];
4055 a0 = W0*b[0]+W0*b[4];
4056 a4 = W0*b[0]-W0*b[4];
4057 /*step 2*/
4058 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4059 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4060 /*step 3*/
4061 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4062 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4063 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4064 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4065 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4066 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4067 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4068 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4070 static void wmv2_idct_col(short * b)
4072 int s1,s2;
4073 int a0,a1,a2,a3,a4,a5,a6,a7;
4074 /*step 1, with extended precision*/
4075 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4076 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4077 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4078 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4079 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4080 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4081 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4082 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4083 /*step 2*/
4084 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4085 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4086 /*step 3*/
4087 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4088 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4089 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4090 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4092 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4093 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4094 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4095 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4097 void ff_wmv2_idct_c(short * block){
4098 int i;
4100 for(i=0;i<64;i+=8){
4101 wmv2_idct_row(block+i);
4103 for(i=0;i<8;i++){
4104 wmv2_idct_col(block+i);
4107 /* XXX: these functions should be removed as soon as all IDCTs are
4108 converted */
4109 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4111 ff_wmv2_idct_c(block);
4112 put_pixels_clamped_c(block, dest, line_size);
4114 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4116 ff_wmv2_idct_c(block);
4117 add_pixels_clamped_c(block, dest, line_size);
4119 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4121 j_rev_dct (block);
4122 put_pixels_clamped_c(block, dest, line_size);
4124 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4126 j_rev_dct (block);
4127 add_pixels_clamped_c(block, dest, line_size);
4130 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4132 j_rev_dct4 (block);
4133 put_pixels_clamped4_c(block, dest, line_size);
4135 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4137 j_rev_dct4 (block);
4138 add_pixels_clamped4_c(block, dest, line_size);
4141 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4143 j_rev_dct2 (block);
4144 put_pixels_clamped2_c(block, dest, line_size);
4146 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4148 j_rev_dct2 (block);
4149 add_pixels_clamped2_c(block, dest, line_size);
4152 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4154 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4156 dest[0] = cm[(block[0] + 4)>>3];
4158 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4160 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4162 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4165 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4167 /* init static data */
4168 void dsputil_static_init(void)
4170 int i;
4172 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4173 for(i=0;i<MAX_NEG_CROP;i++) {
4174 ff_cropTbl[i] = 0;
4175 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4178 for(i=0;i<512;i++) {
4179 ff_squareTbl[i] = (i - 256) * (i - 256);
4182 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
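/* ff_cropTbl is a branch-free clamping table: with
   cm = ff_cropTbl + MAX_NEG_CROP, cm[x] equals av_clip(x, 0, 255) for any
   x in [-MAX_NEG_CROP, 255 + MAX_NEG_CROP), which is how the idct1 put/add
   functions above clamp their output. ff_squareTbl caches (i-256)^2, so a
   signed byte difference d can be squared as (ff_squareTbl + 256)[d]. */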
4185 int ff_check_alignment(void){
4186 static int did_fail=0;
4187 DECLARE_ALIGNED_16(int, aligned);
4189 if((long)&aligned & 15){
4190 if(!did_fail){
4191 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4192 av_log(NULL, AV_LOG_ERROR,
4193 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4194 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4195 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4196 "Do not report crashes to FFmpeg developers.\n");
4197 #endif
4198 did_fail=1;
4200 return -1;
4202 return 0;
4205 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4207 int i;
4209 ff_check_alignment();
4211 #ifdef CONFIG_ENCODERS
4212 if(avctx->dct_algo==FF_DCT_FASTINT) {
4213 c->fdct = fdct_ifast;
4214 c->fdct248 = fdct_ifast248;
4216 else if(avctx->dct_algo==FF_DCT_FAAN) {
4217 c->fdct = ff_faandct;
4218 c->fdct248 = ff_faandct248;
4220 else {
4221 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4222 c->fdct248 = ff_fdct248_islow;
4224 #endif //CONFIG_ENCODERS
4226 if(avctx->lowres==1){
4227 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4228 c->idct_put= ff_jref_idct4_put;
4229 c->idct_add= ff_jref_idct4_add;
4230 }else{
4231 c->idct_put= ff_h264_lowres_idct_put_c;
4232 c->idct_add= ff_h264_lowres_idct_add_c;
4234 c->idct = j_rev_dct4;
4235 c->idct_permutation_type= FF_NO_IDCT_PERM;
4236 }else if(avctx->lowres==2){
4237 c->idct_put= ff_jref_idct2_put;
4238 c->idct_add= ff_jref_idct2_add;
4239 c->idct = j_rev_dct2;
4240 c->idct_permutation_type= FF_NO_IDCT_PERM;
4241 }else if(avctx->lowres==3){
4242 c->idct_put= ff_jref_idct1_put;
4243 c->idct_add= ff_jref_idct1_add;
4244 c->idct = j_rev_dct1;
4245 c->idct_permutation_type= FF_NO_IDCT_PERM;
4246 }else{
4247 if(avctx->idct_algo==FF_IDCT_INT){
4248 c->idct_put= ff_jref_idct_put;
4249 c->idct_add= ff_jref_idct_add;
4250 c->idct = j_rev_dct;
4251 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4252 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4253 avctx->idct_algo==FF_IDCT_VP3){
4254 c->idct_put= ff_vp3_idct_put_c;
4255 c->idct_add= ff_vp3_idct_add_c;
4256 c->idct = ff_vp3_idct_c;
4257 c->idct_permutation_type= FF_NO_IDCT_PERM;
4258 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4259 c->idct_put= ff_wmv2_idct_put_c;
4260 c->idct_add= ff_wmv2_idct_add_c;
4261 c->idct = ff_wmv2_idct_c;
4262 c->idct_permutation_type= FF_NO_IDCT_PERM;
4263 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4264 c->idct_put= ff_faanidct_put;
4265 c->idct_add= ff_faanidct_add;
4266 c->idct = ff_faanidct;
4267 c->idct_permutation_type= FF_NO_IDCT_PERM;
4268 }else if(ENABLE_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4269 c->idct_put= ff_ea_idct_put_c;
4270 c->idct_permutation_type= FF_NO_IDCT_PERM;
4271 }else{ //accurate/default
4272 c->idct_put= ff_simple_idct_put;
4273 c->idct_add= ff_simple_idct_add;
4274 c->idct = ff_simple_idct;
4275 c->idct_permutation_type= FF_NO_IDCT_PERM;
4279 if (ENABLE_H264_DECODER) {
4280 c->h264_idct_add= ff_h264_idct_add_c;
4281 c->h264_idct8_add= ff_h264_idct8_add_c;
4282 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4283 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4286 c->get_pixels = get_pixels_c;
4287 c->diff_pixels = diff_pixels_c;
4288 c->put_pixels_clamped = put_pixels_clamped_c;
4289 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4290 c->add_pixels_clamped = add_pixels_clamped_c;
4291 c->add_pixels8 = add_pixels8_c;
4292 c->add_pixels4 = add_pixels4_c;
4293 c->sum_abs_dctelem = sum_abs_dctelem_c;
4294 c->gmc1 = gmc1_c;
4295 c->gmc = ff_gmc_c;
4296 c->clear_block = clear_block_c;
4297 c->clear_blocks = clear_blocks_c;
4298 c->pix_sum = pix_sum_c;
4299 c->pix_norm1 = pix_norm1_c;
4301 /* pix_abs[0]: 16x16 blocks, pix_abs[1]: 8x8 blocks */
4302 c->pix_abs[0][0] = pix_abs16_c;
4303 c->pix_abs[0][1] = pix_abs16_x2_c;
4304 c->pix_abs[0][2] = pix_abs16_y2_c;
4305 c->pix_abs[0][3] = pix_abs16_xy2_c;
4306 c->pix_abs[1][0] = pix_abs8_c;
4307 c->pix_abs[1][1] = pix_abs8_x2_c;
4308 c->pix_abs[1][2] = pix_abs8_y2_c;
4309 c->pix_abs[1][3] = pix_abs8_xy2_c;
4311 #define dspfunc(PFX, IDX, NUM) \
4312 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4313 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4314 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4315 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4317 dspfunc(put, 0, 16);
4318 dspfunc(put_no_rnd, 0, 16);
4319 dspfunc(put, 1, 8);
4320 dspfunc(put_no_rnd, 1, 8);
4321 dspfunc(put, 2, 4);
4322 dspfunc(put, 3, 2);
4324 dspfunc(avg, 0, 16);
4325 dspfunc(avg_no_rnd, 0, 16);
4326 dspfunc(avg, 1, 8);
4327 dspfunc(avg_no_rnd, 1, 8);
4328 dspfunc(avg, 2, 4);
4329 dspfunc(avg, 3, 2);
4330 #undef dspfunc
4332 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4333 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4335 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4336 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4337 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4338 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4339 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4340 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4341 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4342 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4343 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4345 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4346 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4347 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4348 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4349 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4350 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4351 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4352 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4353 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4355 #define dspfunc(PFX, IDX, NUM) \
4356 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4357 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4358 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4359 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4360 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4361 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4362 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4363 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4364 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4365 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4366 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4367 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4368 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4369 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4370 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4371 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4373 dspfunc(put_qpel, 0, 16);
4374 dspfunc(put_no_rnd_qpel, 0, 16);
4376 dspfunc(avg_qpel, 0, 16);
4377 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4379 dspfunc(put_qpel, 1, 8);
4380 dspfunc(put_no_rnd_qpel, 1, 8);
4382 dspfunc(avg_qpel, 1, 8);
4383 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4385 dspfunc(put_h264_qpel, 0, 16);
4386 dspfunc(put_h264_qpel, 1, 8);
4387 dspfunc(put_h264_qpel, 2, 4);
4388 dspfunc(put_h264_qpel, 3, 2);
4389 dspfunc(avg_h264_qpel, 0, 16);
4390 dspfunc(avg_h264_qpel, 1, 8);
4391 dspfunc(avg_h264_qpel, 2, 4);
4393 #undef dspfunc
4394 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4395 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4396 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4397 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4398 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4399 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4400 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4402 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4403 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4404 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4405 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4406 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4407 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4408 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4409 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4410 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4411 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4412 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4413 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4414 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4415 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4416 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4417 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4418 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4419 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4420 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4421 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4423 c->draw_edges = draw_edges_c;
4425 #ifdef CONFIG_CAVS_DECODER
4426 ff_cavsdsp_init(c,avctx);
4427 #endif
4428 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4429 ff_vc1dsp_init(c,avctx);
4430 #endif
4431 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4432 ff_intrax8dsp_init(c,avctx);
4433 #endif
4434 #if defined(CONFIG_H264_ENCODER)
4435 ff_h264dspenc_init(c,avctx);
4436 #endif
4437 #if defined(CONFIG_RV40_DECODER)
4438 ff_rv40dsp_init(c,avctx);
4439 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4440 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4441 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4442 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4443 #endif
4445 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4446 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4447 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4448 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4449 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4450 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4451 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4452 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4454 #define SET_CMP_FUNC(name) \
4455 c->name[0]= name ## 16_c;\
4456 c->name[1]= name ## 8x8_c;
4458 SET_CMP_FUNC(hadamard8_diff)
4459 c->hadamard8_diff[4]= hadamard8_intra16_c;
4460 SET_CMP_FUNC(dct_sad)
4461 SET_CMP_FUNC(dct_max)
4462 #ifdef CONFIG_GPL
4463 SET_CMP_FUNC(dct264_sad)
4464 #endif
4465 c->sad[0]= pix_abs16_c;
4466 c->sad[1]= pix_abs8_c;
4467 c->sse[0]= sse16_c;
4468 c->sse[1]= sse8_c;
4469 c->sse[2]= sse4_c;
4470 SET_CMP_FUNC(quant_psnr)
4471 SET_CMP_FUNC(rd)
4472 SET_CMP_FUNC(bit)
4473 c->vsad[0]= vsad16_c;
4474 c->vsad[4]= vsad_intra16_c;
4475 c->vsse[0]= vsse16_c;
4476 c->vsse[4]= vsse_intra16_c;
4477 c->nsse[0]= nsse16_c;
4478 c->nsse[1]= nsse8_c;
4479 #ifdef CONFIG_SNOW_ENCODER
4480 c->w53[0]= w53_16_c;
4481 c->w53[1]= w53_8_c;
4482 c->w97[0]= w97_16_c;
4483 c->w97[1]= w97_8_c;
4484 #endif
4486 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4488 c->add_bytes= add_bytes_c;
4489 c->add_bytes_l2= add_bytes_l2_c;
4490 c->diff_bytes= diff_bytes_c;
4491 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4492 c->bswap_buf= bswap_buf;
4493 #ifdef CONFIG_PNG_DECODER
4494 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4495 #endif
4497 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4498 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4499 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4500 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4501 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4502 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4503 c->h264_loop_filter_strength= NULL;
4505 if (ENABLE_ANY_H263) {
4506 c->h263_h_loop_filter= h263_h_loop_filter_c;
4507 c->h263_v_loop_filter= h263_v_loop_filter_c;
4510 if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
4511 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4512 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4515 c->h261_loop_filter= h261_loop_filter_c;
4517 c->try_8x8basis= try_8x8basis_c;
4518 c->add_8x8basis= add_8x8basis_c;
4520 #ifdef CONFIG_SNOW_DECODER
4521 c->vertical_compose97i = ff_snow_vertical_compose97i;
4522 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4523 c->inner_add_yblock = ff_snow_inner_add_yblock;
4524 #endif
4526 #ifdef CONFIG_VORBIS_DECODER
4527 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4528 #endif
4529 #ifdef CONFIG_AC3_DECODER
4530 c->ac3_downmix = ff_ac3_downmix_c;
4531 #endif
4532 #ifdef CONFIG_FLAC_ENCODER
4533 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4534 #endif
4535 c->vector_fmul = vector_fmul_c;
4536 c->vector_fmul_reverse = vector_fmul_reverse_c;
4537 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4538 c->vector_fmul_window = ff_vector_fmul_window_c;
4539 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4540 c->float_to_int16 = ff_float_to_int16_c;
4541 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4542 c->add_int16 = add_int16_c;
4543 c->sub_int16 = sub_int16_c;
4544 c->scalarproduct_int16 = scalarproduct_int16_c;
4546 c->shrink[0]= ff_img_copy_plane;
4547 c->shrink[1]= ff_shrink22;
4548 c->shrink[2]= ff_shrink44;
4549 c->shrink[3]= ff_shrink88;
4551 c->prefetch= just_return;
4553 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4554 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4556 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4557 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4558 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4559 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4560 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4561 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4562 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4563 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4564 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4566 for(i=0; i<64; i++){
4567 if(!c->put_2tap_qpel_pixels_tab[0][i])
4568 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4569 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4570 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4573 switch(c->idct_permutation_type){
4574 case FF_NO_IDCT_PERM:
4575 for(i=0; i<64; i++)
4576 c->idct_permutation[i]= i;
4577 break;
4578 case FF_LIBMPEG2_IDCT_PERM:
4579 for(i=0; i<64; i++)
4580 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4581 break;
4582 case FF_SIMPLE_IDCT_PERM:
4583 for(i=0; i<64; i++)
4584 c->idct_permutation[i]= simple_mmx_permutation[i];
4585 break;
4586 case FF_TRANSPOSE_IDCT_PERM:
4587 for(i=0; i<64; i++)
4588 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4589 break;
4590 case FF_PARTTRANS_IDCT_PERM:
4591 for(i=0; i<64; i++)
4592 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4593 break;
4594 case FF_SSE2_IDCT_PERM:
4595 for(i=0; i<64; i++)
4596 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4597 break;
4598 default:
4599 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
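/* idct_permutation lets each IDCT consume coefficients in its preferred
   memory order: quantization tables and scantables are permuted so that
   canonical position i ends up at idct_permutation[i]. For example
   FF_TRANSPOSE_IDCT_PERM maps row*8 + col to col*8 + row (a full
   transpose), while FF_NO_IDCT_PERM is the identity. */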