/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */

#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "mathops.h"
#include "h263.h"
#include "snow.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
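
/* Worked example of the ~0UL/255 trick (plain unsigned arithmetic, added for
 * illustration): with a 32-bit unsigned long, ~0UL/255 = 0xFFFFFFFF/0xFF =
 * 0x01010101, so pb_7f = 0x01010101 * 0x7f = 0x7f7f7f7f; with a 64-bit
 * unsigned long the same division gives 0x0101010101010101 and
 * pb_7f = 0x7f7f7f7f7f7f7f7f. The byte is thus replicated into every byte
 * lane of the native word, whatever its size. */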

const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
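
/* Example (illustration only): the scan visits raster positions
 * 0, 1, 8, 16, 9, 2, ..., so scan position 4 reads the coefficient at
 * raster index 9 (row 1, column 1), walking the 8x8 block in anti-diagonal
 * order from the DC coefficient towards the highest frequencies. */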

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };

const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
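
/* Worked example of the identity above (illustration, easily checked by
 * hand): ff_inverse[3] = 1431655766 = ceil(2^32/3), so for a = 65536,
 * a*ff_inverse[3] = 0x555555560000 and (a*ff_inverse[3])>>32 = 0x5555 =
 * 21845 = 65536/3. The table thus replaces an integer division by b with
 * one multiplication and one shift. */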

/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};

void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
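
/* Added note: st->permutated[] is the scan order composed with the IDCT's
 * coefficient permutation, and st->raster_end[i] is the highest permutated
 * index occurring in scan positions 0..i (the running maximum computed by
 * the second loop above). */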

static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
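
/* Explanatory note on the sq pointer used by the sse*_c functions (comment
 * only, no behaviour change): ff_squareTbl is laid out so that
 * sq = ff_squareTbl + 256 can be indexed with the signed difference
 * pix1[i] - pix2[i] in [-255, 255]; sq[d] holds d*d, so no abs() is needed
 * before squaring. */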

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif

/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}

/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
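
/* Hypothetical usage sketch (not part of the original file): a motion
 * compensation routine that needs an 8x8 block whose top-left corner
 * (mx, my) may lie outside the picture could do
 *
 *     uint8_t tmp[8*linesize];                       // scratch, 8 lines
 *     ff_emulated_edge_mc(tmp, src + my*linesize + mx, linesize,
 *                         8, 8, mx, my, pic_width, pic_height);
 *     // then read the block from tmp instead of src
 *
 * The function clamps the read area to the picture and replicates the
 * border samples into the parts of tmp that fall outside it. */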

static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 32 bit variant; the disabled #if 0 branch above is the 64 bit one

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
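
/* Explanatory note (added): avg2/avg4 are the scalar forms of the rounded
 * averages used throughout this file. The packed variants above rely on the
 * identities (a+b+1)>>1 == (a|b) - ((a^b)>>1) and (a+b)>>1 == (a&b) +
 * ((a^b)>>1); masking the xor with 0xFE...FE before the shift keeps each
 * byte's shifted-out bit from leaking into the neighbouring byte lane,
 * which is what rnd_avg32()/no_rnd_avg32() and the 64-bit forms implement.
 * The xy2 functions instead split every byte into its low 2 bits (l0/l1)
 * and its high 6 bits pre-shifted down by 2 (h0/h1), so four pixels plus
 * the rounding constant can be summed per byte lane without overflow. */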

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
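
/* Added note: the four bilinear weights satisfy
 * A+B+C+D = (16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16 + x16*y16 = 256,
 * so after adding the rounder the >>8 renormalizes the interpolation. */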

void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
                                           + src[index       +1]*   frac_x )*(s-frac_y)
                                        + (  src[index+stride  ]*(s-frac_x)
                                           + src[index+stride+1]*   frac_x )*   frac_y
                                        + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                        + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                        + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
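
/* Added note, a reading of the code above rather than new behaviour: ox/oy
 * (and the per-column vx/vy) are fixed-point source coordinates; vx>>16
 * gives the position in units of 1/(1<<shift) pixels, from which
 * frac_x/frac_y (the bilinear weights, in 0..s-1) and the integer sample
 * position are split. vx/vy advance by (dxx, dyx) per output column and
 * ox/oy by (dxy, dyy) per output row, which makes the transform affine;
 * the av_clip() branches replicate the border when a source coordinate
 * leaves the picture. */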

static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
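
/* Worked example for the constant 683 (illustration only): 683 =
 * round(2^11/3), so (683*(2*a + b + 1)) >> 11 approximates (2*a + b)/3,
 * the one-third-pel weighting; e.g. a = 0, b = 255 gives (683*256)>>11 =
 * 85 = 255/3. The mc11..mc22 variants below use 2731 = round(2^15/12) the
 * same way for the /12 weightings, whose coefficients sum to 12. */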

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif

#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
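
/* Added note: in the chroma MC above the weights satisfy A+B+C+D =
 * (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64, which is why op_put/op_avg
 * add 32 and shift right by 6 to renormalize with rounding. */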

static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}

#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1815 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1816 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1817 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1818 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1819 dst++;\
1820 src++;\
1824 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1825 OPNAME ## pixels8_c(dst, src, stride, 8);\
1828 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1829 uint8_t half[64];\
1830 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1831 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1834 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1835 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1838 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1839 uint8_t half[64];\
1840 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1841 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1844 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t full[16*9];\
1846 uint8_t half[64];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1849 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1852 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1853 uint8_t full[16*9];\
1854 copy_block9(full, src, 16, stride, 9);\
1855 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1858 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1859 uint8_t full[16*9];\
1860 uint8_t half[64];\
1861 copy_block9(full, src, 16, stride, 9);\
1862 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1863 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1865 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1866 uint8_t full[16*9];\
1867 uint8_t halfH[72];\
1868 uint8_t halfV[64];\
1869 uint8_t halfHV[64];\
1870 copy_block9(full, src, 16, stride, 9);\
1871 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1873 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1874 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1876 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1877 uint8_t full[16*9];\
1878 uint8_t halfH[72];\
1879 uint8_t halfHV[64];\
1880 copy_block9(full, src, 16, stride, 9);\
1881 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1882 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1883 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1884 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1886 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t full[16*9];\
1888 uint8_t halfH[72];\
1889 uint8_t halfV[64];\
1890 uint8_t halfHV[64];\
1891 copy_block9(full, src, 16, stride, 9);\
1892 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1894 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1897 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1898 uint8_t full[16*9];\
1899 uint8_t halfH[72];\
1900 uint8_t halfHV[64];\
1901 copy_block9(full, src, 16, stride, 9);\
1902 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1903 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1904 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1905 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1907 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1908 uint8_t full[16*9];\
1909 uint8_t halfH[72];\
1910 uint8_t halfV[64];\
1911 uint8_t halfHV[64];\
1912 copy_block9(full, src, 16, stride, 9);\
1913 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1914 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1915 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1916 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1918 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[16*9];\
1920 uint8_t halfH[72];\
1921 uint8_t halfHV[64];\
1922 copy_block9(full, src, 16, stride, 9);\
1923 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1924 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1925 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1926 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1928 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[16*9];\
1930 uint8_t halfH[72];\
1931 uint8_t halfV[64];\
1932 uint8_t halfHV[64];\
1933 copy_block9(full, src, 16, stride, 9);\
1934 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1936 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1937 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1939 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1940 uint8_t full[16*9];\
1941 uint8_t halfH[72];\
1942 uint8_t halfHV[64];\
1943 copy_block9(full, src, 16, stride, 9);\
1944 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1945 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1946 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1947 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1949 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1950 uint8_t halfH[72];\
1951 uint8_t halfHV[64];\
1952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1954 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1956 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1957 uint8_t halfH[72];\
1958 uint8_t halfHV[64];\
1959 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1960 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1961 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1963 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1964 uint8_t full[16*9];\
1965 uint8_t halfH[72];\
1966 uint8_t halfV[64];\
1967 uint8_t halfHV[64];\
1968 copy_block9(full, src, 16, stride, 9);\
1969 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1972 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1974 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1975 uint8_t full[16*9];\
1976 uint8_t halfH[72];\
1977 copy_block9(full, src, 16, stride, 9);\
1978 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1979 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1980 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1982 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1983 uint8_t full[16*9];\
1984 uint8_t halfH[72];\
1985 uint8_t halfV[64];\
1986 uint8_t halfHV[64];\
1987 copy_block9(full, src, 16, stride, 9);\
1988 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1989 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1990 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1991 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1993 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t full[16*9];\
1995 uint8_t halfH[72];\
1996 copy_block9(full, src, 16, stride, 9);\
1997 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1998 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1999 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2001 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2002 uint8_t halfH[72];\
2003 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2004 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2006 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2007 OPNAME ## pixels16_c(dst, src, stride, 16);\
2010 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2011 uint8_t half[256];\
2012 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2013 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2016 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2017 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2020 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2021 uint8_t half[256];\
2022 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2023 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2026 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2027 uint8_t full[24*17];\
2028 uint8_t half[256];\
2029 copy_block17(full, src, 24, stride, 17);\
2030 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2031 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2034 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2035 uint8_t full[24*17];\
2036 copy_block17(full, src, 24, stride, 17);\
2037 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2040 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2041 uint8_t full[24*17];\
2042 uint8_t half[256];\
2043 copy_block17(full, src, 24, stride, 17);\
2044 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2045 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2047 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2048 uint8_t full[24*17];\
2049 uint8_t halfH[272];\
2050 uint8_t halfV[256];\
2051 uint8_t halfHV[256];\
2052 copy_block17(full, src, 24, stride, 17);\
2053 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2055 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2056 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2058 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2059 uint8_t full[24*17];\
2060 uint8_t halfH[272];\
2061 uint8_t halfHV[256];\
2062 copy_block17(full, src, 24, stride, 17);\
2063 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2064 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2065 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2066 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2068 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2069 uint8_t full[24*17];\
2070 uint8_t halfH[272];\
2071 uint8_t halfV[256];\
2072 uint8_t halfHV[256];\
2073 copy_block17(full, src, 24, stride, 17);\
2074 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2076 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2077 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2079 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2080 uint8_t full[24*17];\
2081 uint8_t halfH[272];\
2082 uint8_t halfHV[256];\
2083 copy_block17(full, src, 24, stride, 17);\
2084 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2085 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2086 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2087 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2089 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2090 uint8_t full[24*17];\
2091 uint8_t halfH[272];\
2092 uint8_t halfV[256];\
2093 uint8_t halfHV[256];\
2094 copy_block17(full, src, 24, stride, 17);\
2095 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2096 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2097 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2098 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2100 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2101 uint8_t full[24*17];\
2102 uint8_t halfH[272];\
2103 uint8_t halfHV[256];\
2104 copy_block17(full, src, 24, stride, 17);\
2105 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2106 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2107 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2108 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2110 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2111 uint8_t full[24*17];\
2112 uint8_t halfH[272];\
2113 uint8_t halfV[256];\
2114 uint8_t halfHV[256];\
2115 copy_block17(full, src, 24, stride, 17);\
2116 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2118 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2119 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2121 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2122 uint8_t full[24*17];\
2123 uint8_t halfH[272];\
2124 uint8_t halfHV[256];\
2125 copy_block17(full, src, 24, stride, 17);\
2126 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2127 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2128 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2129 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2131 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2132 uint8_t halfH[272];\
2133 uint8_t halfHV[256];\
2134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2136 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2138 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2139 uint8_t halfH[272];\
2140 uint8_t halfHV[256];\
2141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2142 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2143 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2145 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2146 uint8_t full[24*17];\
2147 uint8_t halfH[272];\
2148 uint8_t halfV[256];\
2149 uint8_t halfHV[256];\
2150 copy_block17(full, src, 24, stride, 17);\
2151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2153 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2154 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2156 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2157 uint8_t full[24*17];\
2158 uint8_t halfH[272];\
2159 copy_block17(full, src, 24, stride, 17);\
2160 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2161 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2162 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2164 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2165 uint8_t full[24*17];\
2166 uint8_t halfH[272];\
2167 uint8_t halfV[256];\
2168 uint8_t halfHV[256];\
2169 copy_block17(full, src, 24, stride, 17);\
2170 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2171 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2172 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2173 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2175 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2176 uint8_t full[24*17];\
2177 uint8_t halfH[272];\
2178 copy_block17(full, src, 24, stride, 17);\
2179 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2180 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2181 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2183 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2184 uint8_t halfH[272];\
2185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2186 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2189 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2190 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2191 #define op_put(a, b) a = cm[((b) + 16)>>5]
2192 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2194 QPEL_MC(0, put_ , _ , op_put)
2195 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2196 QPEL_MC(0, avg_ , _ , op_avg)
2197 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2198 #undef op_avg
2199 #undef op_avg_no_rnd
2200 #undef op_put
2201 #undef op_put_no_rnd
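/* H.264 luma sub-pel filtering. Half-pel samples use the 6-tap filter
 * (1, -5, 20, 20, -5, 1), which also sums to 32. One-dimensional passes
 * round with (+16)>>5; the 2D (hv) versions keep the unclipped first
 * pass in a 16-bit tmp buffer (values stay within roughly -2550..10710,
 * so int16_t suffices) and the second pass divides by 32*32 via
 * (+512)>>10 in the OP2 macros. */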
2203 #if 1
2204 #define H264_LOWPASS(OPNAME, OP, OP2) \
2205 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206 const int h=2;\
2207 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208 int i;\
2209 for(i=0; i<h; i++)\
2211 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2212 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2213 dst+=dstStride;\
2214 src+=srcStride;\
2218 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2219 const int w=2;\
2220 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2221 int i;\
2222 for(i=0; i<w; i++)\
2224 const int srcB= src[-2*srcStride];\
2225 const int srcA= src[-1*srcStride];\
2226 const int src0= src[0 *srcStride];\
2227 const int src1= src[1 *srcStride];\
2228 const int src2= src[2 *srcStride];\
2229 const int src3= src[3 *srcStride];\
2230 const int src4= src[4 *srcStride];\
2231 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2232 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2233 dst++;\
2234 src++;\
2238 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2239 const int h=2;\
2240 const int w=2;\
2241 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2242 int i;\
2243 src -= 2*srcStride;\
2244 for(i=0; i<h+5; i++)\
2246 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2247 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2248 tmp+=tmpStride;\
2249 src+=srcStride;\
2251 tmp -= tmpStride*(h+5-2);\
2252 for(i=0; i<w; i++)\
2254 const int tmpB= tmp[-2*tmpStride];\
2255 const int tmpA= tmp[-1*tmpStride];\
2256 const int tmp0= tmp[0 *tmpStride];\
2257 const int tmp1= tmp[1 *tmpStride];\
2258 const int tmp2= tmp[2 *tmpStride];\
2259 const int tmp3= tmp[3 *tmpStride];\
2260 const int tmp4= tmp[4 *tmpStride];\
2261 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2262 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2263 dst++;\
2264 tmp++;\
2267 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2268 const int h=4;\
2269 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2270 int i;\
2271 for(i=0; i<h; i++)\
2273 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2274 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2275 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2276 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2277 dst+=dstStride;\
2278 src+=srcStride;\
2282 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2283 const int w=4;\
2284 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2285 int i;\
2286 for(i=0; i<w; i++)\
2288 const int srcB= src[-2*srcStride];\
2289 const int srcA= src[-1*srcStride];\
2290 const int src0= src[0 *srcStride];\
2291 const int src1= src[1 *srcStride];\
2292 const int src2= src[2 *srcStride];\
2293 const int src3= src[3 *srcStride];\
2294 const int src4= src[4 *srcStride];\
2295 const int src5= src[5 *srcStride];\
2296 const int src6= src[6 *srcStride];\
2297 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2298 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2299 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2300 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2301 dst++;\
2302 src++;\
2306 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2307 const int h=4;\
2308 const int w=4;\
2309 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2310 int i;\
2311 src -= 2*srcStride;\
2312 for(i=0; i<h+5; i++)\
2314 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2315 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2316 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2317 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2318 tmp+=tmpStride;\
2319 src+=srcStride;\
2321 tmp -= tmpStride*(h+5-2);\
2322 for(i=0; i<w; i++)\
2324 const int tmpB= tmp[-2*tmpStride];\
2325 const int tmpA= tmp[-1*tmpStride];\
2326 const int tmp0= tmp[0 *tmpStride];\
2327 const int tmp1= tmp[1 *tmpStride];\
2328 const int tmp2= tmp[2 *tmpStride];\
2329 const int tmp3= tmp[3 *tmpStride];\
2330 const int tmp4= tmp[4 *tmpStride];\
2331 const int tmp5= tmp[5 *tmpStride];\
2332 const int tmp6= tmp[6 *tmpStride];\
2333 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2334 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2335 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2336 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2337 dst++;\
2338 tmp++;\
2342 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2343 const int h=8;\
2344 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2345 int i;\
2346 for(i=0; i<h; i++)\
2348 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2349 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2350 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2351 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2352 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2353 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2354 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2355 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2356 dst+=dstStride;\
2357 src+=srcStride;\
2361 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2362 const int w=8;\
2363 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2364 int i;\
2365 for(i=0; i<w; i++)\
2367 const int srcB= src[-2*srcStride];\
2368 const int srcA= src[-1*srcStride];\
2369 const int src0= src[0 *srcStride];\
2370 const int src1= src[1 *srcStride];\
2371 const int src2= src[2 *srcStride];\
2372 const int src3= src[3 *srcStride];\
2373 const int src4= src[4 *srcStride];\
2374 const int src5= src[5 *srcStride];\
2375 const int src6= src[6 *srcStride];\
2376 const int src7= src[7 *srcStride];\
2377 const int src8= src[8 *srcStride];\
2378 const int src9= src[9 *srcStride];\
2379 const int src10=src[10*srcStride];\
2380 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2381 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2382 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2383 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2384 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2385 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2386 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2387 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2388 dst++;\
2389 src++;\
2393 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2394 const int h=8;\
2395 const int w=8;\
2396 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2397 int i;\
2398 src -= 2*srcStride;\
2399 for(i=0; i<h+5; i++)\
2401 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2402 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2403 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2404 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2405 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2406 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2407 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2408 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2409 tmp+=tmpStride;\
2410 src+=srcStride;\
2412 tmp -= tmpStride*(h+5-2);\
2413 for(i=0; i<w; i++)\
2415 const int tmpB= tmp[-2*tmpStride];\
2416 const int tmpA= tmp[-1*tmpStride];\
2417 const int tmp0= tmp[0 *tmpStride];\
2418 const int tmp1= tmp[1 *tmpStride];\
2419 const int tmp2= tmp[2 *tmpStride];\
2420 const int tmp3= tmp[3 *tmpStride];\
2421 const int tmp4= tmp[4 *tmpStride];\
2422 const int tmp5= tmp[5 *tmpStride];\
2423 const int tmp6= tmp[6 *tmpStride];\
2424 const int tmp7= tmp[7 *tmpStride];\
2425 const int tmp8= tmp[8 *tmpStride];\
2426 const int tmp9= tmp[9 *tmpStride];\
2427 const int tmp10=tmp[10*tmpStride];\
2428 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2429 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2430 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2431 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2432 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2433 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2434 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2435 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2436 dst++;\
2437 tmp++;\
2441 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2442 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2443 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2444 src += 8*srcStride;\
2445 dst += 8*dstStride;\
2446 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2447 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2450 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2451 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2452 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2453 src += 8*srcStride;\
2454 dst += 8*dstStride;\
2455 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2456 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2459 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2461 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2462 src += 8*srcStride;\
2463 dst += 8*dstStride;\
2464 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2465 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
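/* H264_MC expands into the 16 quarter-pel variants for one block size.
 * The mcXY suffix encodes the fractional offset in quarter pels: mc00 is
 * a plain copy, mc20/mc02 are the horizontal/vertical half-pel filters,
 * mc22 is the 2D half-pel position, and the remaining quarter positions
 * average two neighbouring half/full-pel planes via
 * pixels ## SIZE ## _l2(). */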
2468 #define H264_MC(OPNAME, SIZE) \
2469 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2470 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2473 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2474 uint8_t half[SIZE*SIZE];\
2475 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2476 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2479 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2480 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2483 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2484 uint8_t half[SIZE*SIZE];\
2485 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2486 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2489 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2490 uint8_t full[SIZE*(SIZE+5)];\
2491 uint8_t * const full_mid= full + SIZE*2;\
2492 uint8_t half[SIZE*SIZE];\
2493 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2494 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2495 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2498 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2499 uint8_t full[SIZE*(SIZE+5)];\
2500 uint8_t * const full_mid= full + SIZE*2;\
2501 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2502 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2505 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2506 uint8_t full[SIZE*(SIZE+5)];\
2507 uint8_t * const full_mid= full + SIZE*2;\
2508 uint8_t half[SIZE*SIZE];\
2509 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2510 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2511 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2514 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2515 uint8_t full[SIZE*(SIZE+5)];\
2516 uint8_t * const full_mid= full + SIZE*2;\
2517 uint8_t halfH[SIZE*SIZE];\
2518 uint8_t halfV[SIZE*SIZE];\
2519 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2520 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2521 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2522 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2525 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2526 uint8_t full[SIZE*(SIZE+5)];\
2527 uint8_t * const full_mid= full + SIZE*2;\
2528 uint8_t halfH[SIZE*SIZE];\
2529 uint8_t halfV[SIZE*SIZE];\
2530 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2531 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2532 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2533 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2536 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2537 uint8_t full[SIZE*(SIZE+5)];\
2538 uint8_t * const full_mid= full + SIZE*2;\
2539 uint8_t halfH[SIZE*SIZE];\
2540 uint8_t halfV[SIZE*SIZE];\
2541 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2542 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2543 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2544 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2547 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2548 uint8_t full[SIZE*(SIZE+5)];\
2549 uint8_t * const full_mid= full + SIZE*2;\
2550 uint8_t halfH[SIZE*SIZE];\
2551 uint8_t halfV[SIZE*SIZE];\
2552 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2553 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2554 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2555 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2558 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2559 int16_t tmp[SIZE*(SIZE+5)];\
2560 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2563 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2564 int16_t tmp[SIZE*(SIZE+5)];\
2565 uint8_t halfH[SIZE*SIZE];\
2566 uint8_t halfHV[SIZE*SIZE];\
2567 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2568 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2569 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2572 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2573 int16_t tmp[SIZE*(SIZE+5)];\
2574 uint8_t halfH[SIZE*SIZE];\
2575 uint8_t halfHV[SIZE*SIZE];\
2576 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2577 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2578 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2581 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2582 uint8_t full[SIZE*(SIZE+5)];\
2583 uint8_t * const full_mid= full + SIZE*2;\
2584 int16_t tmp[SIZE*(SIZE+5)];\
2585 uint8_t halfV[SIZE*SIZE];\
2586 uint8_t halfHV[SIZE*SIZE];\
2587 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2588 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2589 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2590 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2593 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2594 uint8_t full[SIZE*(SIZE+5)];\
2595 uint8_t * const full_mid= full + SIZE*2;\
2596 int16_t tmp[SIZE*(SIZE+5)];\
2597 uint8_t halfV[SIZE*SIZE];\
2598 uint8_t halfHV[SIZE*SIZE];\
2599 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2600 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2601 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2602 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2605 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2606 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2607 #define op_put(a, b) a = cm[((b) + 16)>>5]
2608 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2609 #define op2_put(a, b) a = cm[((b) + 512)>>10]
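/* op/op2 rounding: single-pass results carry a gain of 32, hence
 * (+16)>>5; the hv second pass carries 32*32 = 1024, hence (+512)>>10.
 * The cm[] lookup (ff_cropTbl + MAX_NEG_CROP) clips the result to 0..255
 * without an explicit branch. */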
2611 H264_LOWPASS(put_ , op_put, op2_put)
2612 H264_LOWPASS(avg_ , op_avg, op2_avg)
2613 H264_MC(put_, 2)
2614 H264_MC(put_, 4)
2615 H264_MC(put_, 8)
2616 H264_MC(put_, 16)
2617 H264_MC(avg_, 4)
2618 H264_MC(avg_, 8)
2619 H264_MC(avg_, 16)
2621 #undef op_avg
2622 #undef op_put
2623 #undef op2_avg
2624 #undef op2_put
2625 #endif
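/* Example use (a sketch, assuming the usual DSPContext table layout in
 * which the second index is x + 4*y and the first selects the block
 * size):
 *
 *     // (x=1, y=3) quarter-pel prediction of an 8x8 H.264 block
 *     c->put_h264_qpel_pixels_tab[1][1 + 4*3](dst, src, stride);
 *
 * The tables themselves are wired up in dsputil_init() later in this
 * file. */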
2627 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2628 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2629 #define H264_WEIGHT(W,H) \
2630 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2631 int y; \
2632 offset <<= log2_denom; \
2633 if(log2_denom) offset += 1<<(log2_denom-1); \
2634 for(y=0; y<H; y++, block += stride){ \
2635 op_scale1(0); \
2636 op_scale1(1); \
2637 if(W==2) continue; \
2638 op_scale1(2); \
2639 op_scale1(3); \
2640 if(W==4) continue; \
2641 op_scale1(4); \
2642 op_scale1(5); \
2643 op_scale1(6); \
2644 op_scale1(7); \
2645 if(W==8) continue; \
2646 op_scale1(8); \
2647 op_scale1(9); \
2648 op_scale1(10); \
2649 op_scale1(11); \
2650 op_scale1(12); \
2651 op_scale1(13); \
2652 op_scale1(14); \
2653 op_scale1(15); \
2656 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2657 int y; \
2658 offset = ((offset + 1) | 1) << log2_denom; \
2659 for(y=0; y<H; y++, dst += stride, src += stride){ \
2660 op_scale2(0); \
2661 op_scale2(1); \
2662 if(W==2) continue; \
2663 op_scale2(2); \
2664 op_scale2(3); \
2665 if(W==4) continue; \
2666 op_scale2(4); \
2667 op_scale2(5); \
2668 op_scale2(6); \
2669 op_scale2(7); \
2670 if(W==8) continue; \
2671 op_scale2(8); \
2672 op_scale2(9); \
2673 op_scale2(10); \
2674 op_scale2(11); \
2675 op_scale2(12); \
2676 op_scale2(13); \
2677 op_scale2(14); \
2678 op_scale2(15); \
2682 H264_WEIGHT(16,16)
2683 H264_WEIGHT(16,8)
2684 H264_WEIGHT(8,16)
2685 H264_WEIGHT(8,8)
2686 H264_WEIGHT(8,4)
2687 H264_WEIGHT(4,8)
2688 H264_WEIGHT(4,4)
2689 H264_WEIGHT(4,2)
2690 H264_WEIGHT(2,4)
2691 H264_WEIGHT(2,2)
2693 #undef op_scale1
2694 #undef op_scale2
2695 #undef H264_WEIGHT
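/* H.264 explicit weighted prediction, as instantiated above.
 * weight_h264_pixels* computes clip((pix*weight + round) >> log2_denom)
 * + offset, with the offset pre-scaled by 1<<log2_denom and a rounding
 * half folded in before the loop, so the per-pixel work is one multiply,
 * add and shift. E.g. log2_denom=5, weight=48, offset=2 maps pix=100 to
 * (100*48 + 64 + 16) >> 5 = 152. The biweight version blends two
 * references; its ((offset + 1) | 1) << log2_denom pre-bias makes the
 * final >> (log2_denom + 1) round to nearest while adding the averaged
 * offset ((offset + 1) >> 1). */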
2697 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2699 int i;
2701 for(i=0; i<h; i++){
2702 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2703 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2704 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2705 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2706 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2707 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2708 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2709 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2710 dst+=dstStride;
2711 src+=srcStride;
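/* WMV2 "mspel" half-pel filter: 4 taps (-1, 9, 9, -1)/16 with a +8
 * rounding term, applied horizontally here and vertically in
 * wmv2_mspel8_v_lowpass() below. */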
2715 #if CONFIG_CAVS_DECODER
2716 /* AVS specific */
2717 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2719 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2720 put_pixels8_c(dst, src, stride, 8);
2722 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2723 avg_pixels8_c(dst, src, stride, 8);
2725 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2726 put_pixels16_c(dst, src, stride, 16);
2728 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2729 avg_pixels16_c(dst, src, stride, 16);
2731 #endif /* CONFIG_CAVS_DECODER */
2733 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
2734 /* VC-1 specific */
2735 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2737 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2738 put_pixels8_c(dst, src, stride, 8);
2740 #endif /* CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER */
2742 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2744 /* H264 specific */
2745 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2747 #if CONFIG_RV30_DECODER
2748 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2749 #endif /* CONFIG_RV30_DECODER */
2751 #if CONFIG_RV40_DECODER
2752 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2753 put_pixels16_xy2_c(dst, src, stride, 16);
2755 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2756 avg_pixels16_xy2_c(dst, src, stride, 16);
2758 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2759 put_pixels8_xy2_c(dst, src, stride, 8);
2761 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2762 avg_pixels8_xy2_c(dst, src, stride, 8);
2765 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2766 #endif /* CONFIG_RV40_DECODER */
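/* RV40 implements most of its sub-pel positions in rv40dsp; the (3,3)
 * case above is simply served by the plain 2x2-average (xy2) half-pel
 * copies. */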
2768 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2769 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2770 int i;
2772 for(i=0; i<w; i++){
2773 const int src_1= src[ -srcStride];
2774 const int src0 = src[0 ];
2775 const int src1 = src[ srcStride];
2776 const int src2 = src[2*srcStride];
2777 const int src3 = src[3*srcStride];
2778 const int src4 = src[4*srcStride];
2779 const int src5 = src[5*srcStride];
2780 const int src6 = src[6*srcStride];
2781 const int src7 = src[7*srcStride];
2782 const int src8 = src[8*srcStride];
2783 const int src9 = src[9*srcStride];
2784 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2785 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2786 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2787 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2788 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2789 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2790 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2791 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2792 src++;
2793 dst++;
2797 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2798 put_pixels8_c(dst, src, stride, 8);
2801 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2802 uint8_t half[64];
2803 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2804 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2807 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2808 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2811 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2812 uint8_t half[64];
2813 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2814 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2817 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2818 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2821 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2822 uint8_t halfH[88];
2823 uint8_t halfV[64];
2824 uint8_t halfHV[64];
2825 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2826 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2827 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2828 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2830 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2831 uint8_t halfH[88];
2832 uint8_t halfV[64];
2833 uint8_t halfHV[64];
2834 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2835 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2836 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2837 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2839 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2840 uint8_t halfH[88];
2841 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2842 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
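/* Buffer shapes in the mspel combiners above: halfH is 8x11 bytes (88)
 * because the vertical filter needs one row above and two rows below its
 * 8 output rows, and the horizontal pass starts one row early
 * (src - stride); the vertical passes therefore read from halfH + 8,
 * i.e. one 8-byte row in. */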
2845 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2846 if(CONFIG_ANY_H263) {
2847 int x;
2848 const int strength= ff_h263_loop_filter_strength[qscale];
2850 for(x=0; x<8; x++){
2851 int d1, d2, ad1;
2852 int p0= src[x-2*stride];
2853 int p1= src[x-1*stride];
2854 int p2= src[x+0*stride];
2855 int p3= src[x+1*stride];
2856 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2858 if (d<-2*strength) d1= 0;
2859 else if(d<- strength) d1=-2*strength - d;
2860 else if(d< strength) d1= d;
2861 else if(d< 2*strength) d1= 2*strength - d;
2862 else d1= 0;
2864 p1 += d1;
2865 p2 -= d1;
2866 if(p1&256) p1= ~(p1>>31);
2867 if(p2&256) p2= ~(p2>>31);
2869 src[x-1*stride] = p1;
2870 src[x+0*stride] = p2;
2872 ad1= FFABS(d1)>>1;
2874 d2= av_clip((p0-p3)/4, -ad1, ad1);
2876 src[x-2*stride] = p0 - d2;
2877 src[x+ stride] = p3 + d2;
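/* The piecewise d1 above is the H.263 deblocking nonlinearity: the
 * correction follows d up to +-strength, then ramps back down to zero so
 * strong edges (likely real image content) are left untouched.
 * "if(p1&256) p1= ~(p1>>31)" is a quick clip for the value range
 * possible here: negatives end up as 0, overflows as -1, which the
 * uint8_t store turns into 255. h263_h_loop_filter_c below is the
 * transposed version of the same filter. */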
2882 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2883 if(CONFIG_ANY_H263) {
2884 int y;
2885 const int strength= ff_h263_loop_filter_strength[qscale];
2887 for(y=0; y<8; y++){
2888 int d1, d2, ad1;
2889 int p0= src[y*stride-2];
2890 int p1= src[y*stride-1];
2891 int p2= src[y*stride+0];
2892 int p3= src[y*stride+1];
2893 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2895 if (d<-2*strength) d1= 0;
2896 else if(d<- strength) d1=-2*strength - d;
2897 else if(d< strength) d1= d;
2898 else if(d< 2*strength) d1= 2*strength - d;
2899 else d1= 0;
2901 p1 += d1;
2902 p2 -= d1;
2903 if(p1&256) p1= ~(p1>>31);
2904 if(p2&256) p2= ~(p2>>31);
2906 src[y*stride-1] = p1;
2907 src[y*stride+0] = p2;
2909 ad1= FFABS(d1)>>1;
2911 d2= av_clip((p0-p3)/4, -ad1, ad1);
2913 src[y*stride-2] = p0 - d2;
2914 src[y*stride+1] = p3 + d2;
2919 static void h261_loop_filter_c(uint8_t *src, int stride){
2920 int x,y,xy,yz;
2921 int temp[64];
2923 for(x=0; x<8; x++){
2924 temp[x ] = 4*src[x ];
2925 temp[x + 7*8] = 4*src[x + 7*stride];
2927 for(y=1; y<7; y++){
2928 for(x=0; x<8; x++){
2929 xy = y * stride + x;
2930 yz = y * 8 + x;
2931 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2935 for(y=0; y<8; y++){
2936 src[ y*stride] = (temp[ y*8] + 2)>>2;
2937 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2938 for(x=1; x<7; x++){
2939 xy = y * stride + x;
2940 yz = y * 8 + x;
2941 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
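/* h261_loop_filter applies a separable (1,2,1)x(1,2,1)/16 smoothing
 * inside the 8x8 block; border rows and columns only receive the 1D half
 * of the filter (the 4*src and (temp+2)>>2 special cases), and the four
 * corners pass through unchanged. */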
2946 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2948 int i, d;
2949 for( i = 0; i < 4; i++ ) {
2950 if( tc0[i] < 0 ) {
2951 pix += 4*ystride;
2952 continue;
2954 for( d = 0; d < 4; d++ ) {
2955 const int p0 = pix[-1*xstride];
2956 const int p1 = pix[-2*xstride];
2957 const int p2 = pix[-3*xstride];
2958 const int q0 = pix[0];
2959 const int q1 = pix[1*xstride];
2960 const int q2 = pix[2*xstride];
2962 if( FFABS( p0 - q0 ) < alpha &&
2963 FFABS( p1 - p0 ) < beta &&
2964 FFABS( q1 - q0 ) < beta ) {
2966 int tc = tc0[i];
2967 int i_delta;
2969 if( FFABS( p2 - p0 ) < beta ) {
2970 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2971 tc++;
2973 if( FFABS( q2 - q0 ) < beta ) {
2974 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2975 tc++;
2978 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2979 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2980 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2982 pix += ystride;
2986 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2988 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2990 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2992 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
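/* Standard H.264 in-loop deblocking, normal (non-intra) filter above.
 * Each group of 4 lines shares one tc0[i] threshold; p1/q1 receive an
 * extra correction when the inner samples are smooth (|p2-p0|, resp.
 * |q2-q0|, below beta), and each such correction widens the clipping
 * range tc by one. The _intra variants below apply the stronger filter
 * permitted when |p0-q0| < (alpha>>2) + 2. */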
2995 static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2997 int d;
2998 for( d = 0; d < 16; d++ ) {
2999 const int p2 = pix[-3*xstride];
3000 const int p1 = pix[-2*xstride];
3001 const int p0 = pix[-1*xstride];
3003 const int q0 = pix[ 0*xstride];
3004 const int q1 = pix[ 1*xstride];
3005 const int q2 = pix[ 2*xstride];
3007 if( FFABS( p0 - q0 ) < alpha &&
3008 FFABS( p1 - p0 ) < beta &&
3009 FFABS( q1 - q0 ) < beta ) {
3011 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3012 if( FFABS( p2 - p0 ) < beta)
3014 const int p3 = pix[-4*xstride];
3015 /* p0', p1', p2' */
3016 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3017 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3018 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3019 } else {
3020 /* p0' */
3021 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3023 if( FFABS( q2 - q0 ) < beta)
3025 const int q3 = pix[3*xstride];
3026 /* q0', q1', q2' */
3027 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3028 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3029 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3030 } else {
3031 /* q0' */
3032 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3034 }else{
3035 /* p0', q0' */
3036 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3037 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3040 pix += ystride;
3043 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3045 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3047 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3049 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
3052 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3054 int i, d;
3055 for( i = 0; i < 4; i++ ) {
3056 const int tc = tc0[i];
3057 if( tc <= 0 ) {
3058 pix += 2*ystride;
3059 continue;
3061 for( d = 0; d < 2; d++ ) {
3062 const int p0 = pix[-1*xstride];
3063 const int p1 = pix[-2*xstride];
3064 const int q0 = pix[0];
3065 const int q1 = pix[1*xstride];
3067 if( FFABS( p0 - q0 ) < alpha &&
3068 FFABS( p1 - p0 ) < beta &&
3069 FFABS( q1 - q0 ) < beta ) {
3071 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3073 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3074 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
3076 pix += ystride;
3080 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3082 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3084 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3086 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3089 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3091 int d;
3092 for( d = 0; d < 8; d++ ) {
3093 const int p0 = pix[-1*xstride];
3094 const int p1 = pix[-2*xstride];
3095 const int q0 = pix[0];
3096 const int q1 = pix[1*xstride];
3098 if( FFABS( p0 - q0 ) < alpha &&
3099 FFABS( p1 - p0 ) < beta &&
3100 FFABS( q1 - q0 ) < beta ) {
3102 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3103 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3105 pix += ystride;
3108 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3110 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3112 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3114 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
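/* SAD (sum of absolute differences) comparators used by motion
 * estimation. pix_abs16/pix_abs8 compare against the reference directly;
 * the _x2, _y2 and _xy2 variants compare against horizontally,
 * vertically or 2D half-pel averaged references, using the rounding
 * averages avg2()/avg4() defined earlier in this file. */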
3117 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3119 int s, i;
3121 s = 0;
3122 for(i=0;i<h;i++) {
3123 s += abs(pix1[0] - pix2[0]);
3124 s += abs(pix1[1] - pix2[1]);
3125 s += abs(pix1[2] - pix2[2]);
3126 s += abs(pix1[3] - pix2[3]);
3127 s += abs(pix1[4] - pix2[4]);
3128 s += abs(pix1[5] - pix2[5]);
3129 s += abs(pix1[6] - pix2[6]);
3130 s += abs(pix1[7] - pix2[7]);
3131 s += abs(pix1[8] - pix2[8]);
3132 s += abs(pix1[9] - pix2[9]);
3133 s += abs(pix1[10] - pix2[10]);
3134 s += abs(pix1[11] - pix2[11]);
3135 s += abs(pix1[12] - pix2[12]);
3136 s += abs(pix1[13] - pix2[13]);
3137 s += abs(pix1[14] - pix2[14]);
3138 s += abs(pix1[15] - pix2[15]);
3139 pix1 += line_size;
3140 pix2 += line_size;
3142 return s;
3145 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3147 int s, i;
3149 s = 0;
3150 for(i=0;i<h;i++) {
3151 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3152 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3153 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3154 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3155 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3156 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3157 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3158 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3159 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3160 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3161 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3162 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3163 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3164 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3165 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3166 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3167 pix1 += line_size;
3168 pix2 += line_size;
3170 return s;
3173 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3175 int s, i;
3176 uint8_t *pix3 = pix2 + line_size;
3178 s = 0;
3179 for(i=0;i<h;i++) {
3180 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3181 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3182 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3183 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3184 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3185 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3186 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3187 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3188 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3189 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3190 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3191 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3192 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3193 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3194 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3195 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3196 pix1 += line_size;
3197 pix2 += line_size;
3198 pix3 += line_size;
3199 }
3200 return s;
3201 }
3203 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3204 {
3205 int s, i;
3206 uint8_t *pix3 = pix2 + line_size;
3208 s = 0;
3209 for(i=0;i<h;i++) {
3210 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3211 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3212 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3213 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3214 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3215 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3216 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3217 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3218 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3219 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3220 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3221 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3222 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3223 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3224 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3225 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3226 pix1 += line_size;
3227 pix2 += line_size;
3228 pix3 += line_size;
3229 }
3230 return s;
3231 }
3233 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3234 {
3235 int s, i;
3237 s = 0;
3238 for(i=0;i<h;i++) {
3239 s += abs(pix1[0] - pix2[0]);
3240 s += abs(pix1[1] - pix2[1]);
3241 s += abs(pix1[2] - pix2[2]);
3242 s += abs(pix1[3] - pix2[3]);
3243 s += abs(pix1[4] - pix2[4]);
3244 s += abs(pix1[5] - pix2[5]);
3245 s += abs(pix1[6] - pix2[6]);
3246 s += abs(pix1[7] - pix2[7]);
3247 pix1 += line_size;
3248 pix2 += line_size;
3249 }
3250 return s;
3251 }
3253 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3254 {
3255 int s, i;
3257 s = 0;
3258 for(i=0;i<h;i++) {
3259 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3260 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3261 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3262 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3263 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3264 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3265 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3266 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3267 pix1 += line_size;
3268 pix2 += line_size;
3269 }
3270 return s;
3271 }
3273 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3274 {
3275 int s, i;
3276 uint8_t *pix3 = pix2 + line_size;
3278 s = 0;
3279 for(i=0;i<h;i++) {
3280 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3281 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3282 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3283 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3284 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3285 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3286 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3287 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3288 pix1 += line_size;
3289 pix2 += line_size;
3290 pix3 += line_size;
3291 }
3292 return s;
3293 }
3295 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3296 {
3297 int s, i;
3298 uint8_t *pix3 = pix2 + line_size;
3300 s = 0;
3301 for(i=0;i<h;i++) {
3302 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3303 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3304 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3305 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3306 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3307 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3308 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3309 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3310 pix1 += line_size;
3311 pix2 += line_size;
3312 pix3 += line_size;
3313 }
3314 return s;
3315 }
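/* The pix_abs* family computes the sum of absolute differences (SAD);
 * the _x2/_y2/_xy2 variants compare against the horizontally, vertically
 * or diagonally averaged (half-pel) reference.  Motion estimation picks
 * them from the tables filled in by dsputil_init(); a hedged sketch of a
 * caller (function name and loop context are assumptions):
 */
#if 0
static int sad_halfpel_example(DSPContext *dsp, uint8_t *cur, uint8_t *ref,
                               int stride, int hx, int hy)
{
    /* [0] selects the 16x16 functions; (hx&1) + 2*(hy&1) the subpel case */
    return dsp->pix_abs[0][(hx&1) + 2*(hy&1)](NULL, cur, ref, stride, 16);
}
#endif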
3317 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3318 MpegEncContext *c = v;
3319 int score1=0;
3320 int score2=0;
3321 int x,y;
3323 for(y=0; y<h; y++){
3324 for(x=0; x<16; x++){
3325 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3326 }
3327 if(y+1<h){
3328 for(x=0; x<15; x++){
3329 score2+= FFABS( s1[x ] - s1[x +stride]
3330 - s1[x+1] + s1[x+1+stride])
3331 -FFABS( s2[x ] - s2[x +stride]
3332 - s2[x+1] + s2[x+1+stride]);
3333 }
3334 }
3335 s1+= stride;
3336 s2+= stride;
3337 }
3339 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3340 else return score1 + FFABS(score2)*8;
3341 }
3343 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3344 MpegEncContext *c = v;
3345 int score1=0;
3346 int score2=0;
3347 int x,y;
3349 for(y=0; y<h; y++){
3350 for(x=0; x<8; x++){
3351 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3352 }
3353 if(y+1<h){
3354 for(x=0; x<7; x++){
3355 score2+= FFABS( s1[x ] - s1[x +stride]
3356 - s1[x+1] + s1[x+1+stride])
3357 -FFABS( s2[x ] - s2[x +stride]
3358 - s2[x+1] + s2[x+1+stride]);
3359 }
3360 }
3361 s1+= stride;
3362 s2+= stride;
3363 }
3365 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3366 else return score1 + FFABS(score2)*8;
3367 }
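/* NSSE ("noise preserving" SSE) mixes plain SSE (score1) with the
 * difference in local gradient energy between the two blocks (score2),
 * so a reconstruction that blurs away texture is penalized even when its
 * SSE is small.  Illustratively, with the default weight of 8:
 *
 *     cost = SSE + 8 * |texture(s1) - texture(s2)|
 *
 * avctx->nsse_weight replaces the 8 when a context is available.
 */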
3369 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3370 int i;
3371 unsigned int sum=0;
3373 for(i=0; i<8*8; i++){
3374 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3375 int w= weight[i];
3376 b>>= RECON_SHIFT;
3377 assert(-512<b && b<512);
3379 sum += (w*b)*(w*b)>>4;
3380 }
3381 return sum>>2;
3382 }
3384 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3385 int i;
3387 for(i=0; i<8*8; i++){
3388 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3389 }
3390 }
3392 /**
3393 * Permutes an 8x8 block.
3394 * @param block the block which will be permuted according to the given permutation vector
3395 * @param permutation the permutation vector
3396 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3397 * @param scantable the used scantable; this is only used to speed the permutation up, the block is not
3398 * (inverse) permuted to scantable order!
3399 */
3400 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3401 {
3402 int i;
3403 DCTELEM temp[64];
3405 if(last<=0) return;
3406 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3408 for(i=0; i<=last; i++){
3409 const int j= scantable[i];
3410 temp[j]= block[j];
3411 block[j]=0;
3412 }
3414 for(i=0; i<=last; i++){
3415 const int j= scantable[i];
3416 const int perm_j= permutation[j];
3417 block[perm_j]= temp[j];
3418 }
3419 }
3421 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3422 return 0;
3423 }
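/* ff_block_permute() lets an encoder reorder quantized coefficients into
 * whatever layout the selected (possibly SIMD) IDCT expects.  A minimal
 * self-contained sketch (illustrative, not library code); with the
 * identity permutation the block must come back unchanged:
 */
#if 0
static void block_permute_example(void)
{
    uint8_t perm[64];
    DCTELEM block[64];
    int i;
    for(i=0; i<64; i++){
        perm[i]  = i;  /* FF_NO_IDCT_PERM style identity permutation */
        block[i] = i;
    }
    ff_block_permute(block, perm, ff_zigzag_direct, 63);
}
#endif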
3425 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3426 int i;
3428 memset(cmp, 0, sizeof(void*)*6);
3430 for(i=0; i<6; i++){
3431 switch(type&0xFF){
3432 case FF_CMP_SAD:
3433 cmp[i]= c->sad[i];
3434 break;
3435 case FF_CMP_SATD:
3436 cmp[i]= c->hadamard8_diff[i];
3437 break;
3438 case FF_CMP_SSE:
3439 cmp[i]= c->sse[i];
3440 break;
3441 case FF_CMP_DCT:
3442 cmp[i]= c->dct_sad[i];
3443 break;
3444 case FF_CMP_DCT264:
3445 cmp[i]= c->dct264_sad[i];
3446 break;
3447 case FF_CMP_DCTMAX:
3448 cmp[i]= c->dct_max[i];
3449 break;
3450 case FF_CMP_PSNR:
3451 cmp[i]= c->quant_psnr[i];
3452 break;
3453 case FF_CMP_BIT:
3454 cmp[i]= c->bit[i];
3455 break;
3456 case FF_CMP_RD:
3457 cmp[i]= c->rd[i];
3458 break;
3459 case FF_CMP_VSAD:
3460 cmp[i]= c->vsad[i];
3461 break;
3462 case FF_CMP_VSSE:
3463 cmp[i]= c->vsse[i];
3464 break;
3465 case FF_CMP_ZERO:
3466 cmp[i]= zero_cmp;
3467 break;
3468 case FF_CMP_NSSE:
3469 cmp[i]= c->nsse[i];
3470 break;
3471 #if CONFIG_SNOW_ENCODER
3472 case FF_CMP_W53:
3473 cmp[i]= c->w53[i];
3474 break;
3475 case FF_CMP_W97:
3476 cmp[i]= c->w97[i];
3477 break;
3478 #endif
3479 default:
3480 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3481 }
3482 }
3483 }
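/* ff_set_cmp() fills one comparison-function table according to an
 * FF_CMP_* selector; entries [0] and [1] are the 16x16 and 8x8 versions.
 * A hedged usage sketch mirroring how the encoder wires its motion
 * estimation metrics (the exact call sites live elsewhere):
 */
#if 0
static void set_cmp_example(MpegEncContext *s)
{
    ff_set_cmp(&s->dsp, s->dsp.me_cmp,     s->avctx->me_cmp);
    ff_set_cmp(&s->dsp, s->dsp.me_sub_cmp, s->avctx->me_sub_cmp);
}
#endif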
3485 static void clear_block_c(DCTELEM *block)
3486 {
3487 memset(block, 0, sizeof(DCTELEM)*64);
3488 }
3490 /**
3491 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3492 */
3493 static void clear_blocks_c(DCTELEM *blocks)
3494 {
3495 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3496 }
3498 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3499 long i;
3500 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3501 long a = *(long*)(src+i);
3502 long b = *(long*)(dst+i);
3503 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3504 }
3505 for(; i<w; i++)
3506 dst[i+0] += src[i+0];
3507 }
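/* add_bytes_c() adds packed bytes one machine word at a time (a SWAR
 * trick): masking with pb_7f clears each byte's MSB so per-byte sums
 * cannot carry into the neighbouring byte, and XORing with ((a^b)&pb_80)
 * restores each sum's MSB.  Worked 16-bit example (illustrative):
 *
 *     a = 0x80FF, b = 0x0101
 *     (a&0x7f7f) + (b&0x7f7f) = 0x007F + 0x0101 = 0x0180
 *     (a^b) & 0x8080          = 0x8080
 *     0x0180 ^ 0x8080         = 0x8100  == bytewise (0x80+0x01, 0xFF+0x01)
 */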
3509 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3510 long i;
3511 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3512 long a = *(long*)(src1+i);
3513 long b = *(long*)(src2+i);
3514 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3515 }
3516 for(; i<w; i++)
3517 dst[i] = src1[i]+src2[i];
3518 }
3520 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3521 long i;
3522 #if !HAVE_FAST_UNALIGNED
3523 if((long)src2 & (sizeof(long)-1)){
3524 for(i=0; i+7<w; i+=8){
3525 dst[i+0] = src1[i+0]-src2[i+0];
3526 dst[i+1] = src1[i+1]-src2[i+1];
3527 dst[i+2] = src1[i+2]-src2[i+2];
3528 dst[i+3] = src1[i+3]-src2[i+3];
3529 dst[i+4] = src1[i+4]-src2[i+4];
3530 dst[i+5] = src1[i+5]-src2[i+5];
3531 dst[i+6] = src1[i+6]-src2[i+6];
3532 dst[i+7] = src1[i+7]-src2[i+7];
3533 }
3534 }else
3535 #endif
3536 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3537 long a = *(long*)(src1+i);
3538 long b = *(long*)(src2+i);
3539 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3540 }
3541 for(; i<w; i++)
3542 dst[i+0] = src1[i+0]-src2[i+0];
3543 }
3545 static void add_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *diff, int w, int *left, int *left_top){
3546 int i;
3547 uint8_t l, lt;
3549 l= *left;
3550 lt= *left_top;
3552 for(i=0; i<w; i++){
3553 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3554 lt= src1[i];
3555 dst[i]= l;
3556 }
3558 *left= l;
3559 *left_top= lt;
3560 }
3562 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3563 int i;
3564 uint8_t l, lt;
3566 l= *left;
3567 lt= *left_top;
3569 for(i=0; i<w; i++){
3570 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3571 lt= src1[i];
3572 l= src2[i];
3573 dst[i]= l - pred;
3574 }
3576 *left= l;
3577 *left_top= lt;
3578 }
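/* HuffYUV median prediction: each byte is predicted as
 * mid_pred(left, top, left + top - topleft), the median of the left
 * neighbour, the top neighbour and their gradient estimate;
 * add_hfyu_* reconstructs pixels from residuals, sub_hfyu_* produces
 * them.  Worked example (illustrative): left=10, top=14, topleft=12
 * gives gradient 10+14-12 = 12 and prediction median(10,14,12) = 12.
 */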
3580 #define BUTTERFLY2(o1,o2,i1,i2) \
3581 o1= (i1)+(i2);\
3582 o2= (i1)-(i2);
3584 #define BUTTERFLY1(x,y) \
3585 {\
3586 int a,b;\
3587 a= x;\
3588 b= y;\
3589 x= a+b;\
3590 y= a-b;\
3591 }
3593 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3595 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3596 int i;
3597 int temp[64];
3598 int sum=0;
3600 assert(h==8);
3602 for(i=0; i<8; i++){
3603 //FIXME try pointer walks
3604 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3605 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3606 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3607 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3609 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3610 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3611 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3612 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3614 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3615 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3616 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3617 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3618 }
3620 for(i=0; i<8; i++){
3621 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3622 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3623 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3624 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3626 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3627 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3628 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3629 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3631 sum +=
3632 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3633 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3634 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3635 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3636 }
3637 #if 0
3638 static int maxi=0;
3639 if(sum>maxi){
3640 maxi=sum;
3641 printf("MAX:%d\n", maxi);
3642 }
3643 #endif
3644 return sum;
3645 }
3647 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3648 int i;
3649 int temp[64];
3650 int sum=0;
3652 assert(h==8);
3654 for(i=0; i<8; i++){
3655 //FIXME try pointer walks
3656 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3657 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3658 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3659 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3661 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3662 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3663 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3664 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3666 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3667 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3668 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3669 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3670 }
3672 for(i=0; i<8; i++){
3673 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3674 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3675 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3676 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3678 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3679 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3680 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3681 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3683 sum +=
3684 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3685 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3686 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3687 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3688 }
3690 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3692 return sum;
3693 }
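/* hadamard8_diff applies a 2-D 8x8 Hadamard transform (BUTTERFLY passes
 * over rows, then columns) to the src-dst difference and sums the
 * absolute coefficients: the SATD metric behind FF_CMP_SATD.  The
 * _intra variant transforms the block itself and subtracts
 * |temp[0] + temp[32]| once, removing the DC term so a flat block
 * scores ~0.  SATD tracks real coding cost better than SAD because the
 * transform concentrates structured differences into few coefficients.
 */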
3695 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3696 MpegEncContext * const s= (MpegEncContext *)c;
3697 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3698 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3700 assert(h==8);
3702 s->dsp.diff_pixels(temp, src1, src2, stride);
3703 s->dsp.fdct(temp);
3704 return s->dsp.sum_abs_dctelem(temp);
3705 }
3707 #if CONFIG_GPL
3708 #define DCT8_1D {\
3709 const int s07 = SRC(0) + SRC(7);\
3710 const int s16 = SRC(1) + SRC(6);\
3711 const int s25 = SRC(2) + SRC(5);\
3712 const int s34 = SRC(3) + SRC(4);\
3713 const int a0 = s07 + s34;\
3714 const int a1 = s16 + s25;\
3715 const int a2 = s07 - s34;\
3716 const int a3 = s16 - s25;\
3717 const int d07 = SRC(0) - SRC(7);\
3718 const int d16 = SRC(1) - SRC(6);\
3719 const int d25 = SRC(2) - SRC(5);\
3720 const int d34 = SRC(3) - SRC(4);\
3721 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3722 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3723 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3724 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3725 DST(0, a0 + a1 ) ;\
3726 DST(1, a4 + (a7>>2)) ;\
3727 DST(2, a2 + (a3>>1)) ;\
3728 DST(3, a5 + (a6>>2)) ;\
3729 DST(4, a0 - a1 ) ;\
3730 DST(5, a6 - (a5>>2)) ;\
3731 DST(6, (a2>>1) - a3 ) ;\
3732 DST(7, (a4>>2) - a7 ) ;\
3733 }
3735 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3736 MpegEncContext * const s= (MpegEncContext *)c;
3737 DCTELEM dct[8][8];
3738 int i;
3739 int sum=0;
3741 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3743 #define SRC(x) dct[i][x]
3744 #define DST(x,v) dct[i][x]= v
3745 for( i = 0; i < 8; i++ )
3746 DCT8_1D
3747 #undef SRC
3748 #undef DST
3750 #define SRC(x) dct[x][i]
3751 #define DST(x,v) sum += FFABS(v)
3752 for( i = 0; i < 8; i++ )
3753 DCT8_1D
3754 #undef SRC
3755 #undef DST
3756 return sum;
3757 }
3758 #endif
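/* dct264_sad8x8_c runs the same 1-D H.264-style 8-point DCT (DCT8_1D)
 * twice by re-binding the SRC/DST macros: the first pass reads and
 * writes rows (dct[i][x]), the second reads columns (dct[x][i]) and
 * accumulates |coefficient| straight into sum, so no transpose buffer is
 * needed.
 */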
3760 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3761 MpegEncContext * const s= (MpegEncContext *)c;
3762 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3763 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3764 int sum=0, i;
3766 assert(h==8);
3768 s->dsp.diff_pixels(temp, src1, src2, stride);
3769 s->dsp.fdct(temp);
3771 for(i=0; i<64; i++)
3772 sum= FFMAX(sum, FFABS(temp[i]));
3774 return sum;
3775 }
3777 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3778 MpegEncContext * const s= (MpegEncContext *)c;
3779 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3780 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3781 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3782 int sum=0, i;
3784 assert(h==8);
3785 s->mb_intra=0;
3787 s->dsp.diff_pixels(temp, src1, src2, stride);
3789 memcpy(bak, temp, 64*sizeof(DCTELEM));
3791 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3792 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3793 ff_simple_idct(temp); //FIXME
3795 for(i=0; i<64; i++)
3796 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3798 return sum;
3799 }
3801 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3802 MpegEncContext * const s= (MpegEncContext *)c;
3803 const uint8_t *scantable= s->intra_scantable.permutated;
3804 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3805 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3806 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3807 uint8_t * const bak= (uint8_t*)aligned_bak;
3808 int i, last, run, bits, level, distortion, start_i;
3809 const int esc_length= s->ac_esc_length;
3810 uint8_t * length;
3811 uint8_t * last_length;
3813 assert(h==8);
3815 for(i=0; i<8; i++){
3816 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3817 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3818 }
3820 s->dsp.diff_pixels(temp, src1, src2, stride);
3822 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3824 bits=0;
3826 if (s->mb_intra) {
3827 start_i = 1;
3828 length = s->intra_ac_vlc_length;
3829 last_length= s->intra_ac_vlc_last_length;
3830 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3831 } else {
3832 start_i = 0;
3833 length = s->inter_ac_vlc_length;
3834 last_length= s->inter_ac_vlc_last_length;
3835 }
3837 if(last>=start_i){
3838 run=0;
3839 for(i=start_i; i<last; i++){
3840 int j= scantable[i];
3841 level= temp[j];
3843 if(level){
3844 level+=64;
3845 if((level&(~127)) == 0){
3846 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3847 }else
3848 bits+= esc_length;
3849 run=0;
3850 }else
3851 run++;
3852 }
3853 i= scantable[last];
3855 level= temp[i] + 64;
3857 assert(level - 64);
3859 if((level&(~127)) == 0){
3860 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3861 }else
3862 bits+= esc_length;
3863 }
3866 if(last>=0){
3867 if(s->mb_intra)
3868 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3869 else
3870 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3871 }
3873 s->dsp.idct_add(bak, stride, temp);
3875 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3877 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3878 }
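/* rd8x8_c returns an actual rate-distortion cost: the SSE between the
 * source and the quantize/dequantize/IDCT round trip, plus the exact VLC
 * bit count weighted by roughly qscale^2 * 109/128.  Illustratively:
 *
 *     cost = SSE + lambda * bits,   lambda ~= 0.85 * qscale^2
 */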
3880 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3881 MpegEncContext * const s= (MpegEncContext *)c;
3882 const uint8_t *scantable= s->intra_scantable.permutated;
3883 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3884 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3885 int i, last, run, bits, level, start_i;
3886 const int esc_length= s->ac_esc_length;
3887 uint8_t * length;
3888 uint8_t * last_length;
3890 assert(h==8);
3892 s->dsp.diff_pixels(temp, src1, src2, stride);
3894 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3896 bits=0;
3898 if (s->mb_intra) {
3899 start_i = 1;
3900 length = s->intra_ac_vlc_length;
3901 last_length= s->intra_ac_vlc_last_length;
3902 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3903 } else {
3904 start_i = 0;
3905 length = s->inter_ac_vlc_length;
3906 last_length= s->inter_ac_vlc_last_length;
3907 }
3909 if(last>=start_i){
3910 run=0;
3911 for(i=start_i; i<last; i++){
3912 int j= scantable[i];
3913 level= temp[j];
3915 if(level){
3916 level+=64;
3917 if((level&(~127)) == 0){
3918 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3919 }else
3920 bits+= esc_length;
3921 run=0;
3922 }else
3923 run++;
3924 }
3925 i= scantable[last];
3927 level= temp[i] + 64;
3929 assert(level - 64);
3931 if((level&(~127)) == 0){
3932 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3933 }else
3934 bits+= esc_length;
3935 }
3937 return bits;
3938 }
3940 #define VSAD_INTRA(size) \
3941 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3942 int score=0; \
3943 int x,y; \
3945 for(y=1; y<h; y++){ \
3946 for(x=0; x<size; x+=4){ \
3947 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3948 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3949 } \
3950 s+= stride; \
3951 } \
3953 return score; \
3954 }
3955 VSAD_INTRA(8)
3956 VSAD_INTRA(16)
3958 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3959 int score=0;
3960 int x,y;
3962 for(y=1; y<h; y++){
3963 for(x=0; x<16; x++){
3964 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3965 }
3966 s1+= stride;
3967 s2+= stride;
3968 }
3970 return score;
3971 }
3973 #define SQ(a) ((a)*(a))
3974 #define VSSE_INTRA(size) \
3975 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3976 int score=0; \
3977 int x,y; \
3979 for(y=1; y<h; y++){ \
3980 for(x=0; x<size; x+=4){ \
3981 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3982 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3983 } \
3984 s+= stride; \
3985 } \
3987 return score; \
3988 }
3989 VSSE_INTRA(8)
3990 VSSE_INTRA(16)
3992 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3993 int score=0;
3994 int x,y;
3996 for(y=1; y<h; y++){
3997 for(x=0; x<16; x++){
3998 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3999 }
4000 s1+= stride;
4001 s2+= stride;
4002 }
4004 return score;
4005 }
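/* The vsad/vsse metrics compare vertical gradients rather than pixels:
 * the inter versions sum the absolute (or squared) change of (s1 - s2)
 * between adjacent rows, the _intra versions the row-to-row variation of
 * the block itself.  They are typically used as the ildct comparison
 * when deciding between frame and field DCT of a macroblock.
 */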
4007 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
4008 int size){
4009 int score=0;
4010 int i;
4011 for(i=0; i<size; i++)
4012 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
4013 return score;
4014 }
4016 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4017 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4018 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4019 #if CONFIG_GPL
4020 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4021 #endif
4022 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4023 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4024 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4025 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
4027 static void vector_fmul_c(float *dst, const float *src, int len){
4028 int i;
4029 for(i=0; i<len; i++)
4030 dst[i] *= src[i];
4031 }
4033 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
4034 int i;
4035 src1 += len-1;
4036 for(i=0; i<len; i++)
4037 dst[i] = src0[i] * src1[-i];
4038 }
4040 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
4041 int i;
4042 for(i=0; i<len; i++)
4043 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
4044 }
4046 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
4047 int i,j;
4048 dst += len;
4049 win += len;
4050 src0+= len;
4051 for(i=-len, j=len-1; i<0; i++, j--) {
4052 float s0 = src0[i];
4053 float s1 = src1[j];
4054 float wi = win[i];
4055 float wj = win[j];
4056 dst[i] = s0*wj - s1*wi + add_bias;
4057 dst[j] = s0*wi + s1*wj + add_bias;
4058 }
4059 }
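/* ff_vector_fmul_window_c does the windowed overlap-add of two MDCT
 * halves in a single pass.  With i running backwards over src0 and j
 * forwards over src1 it emits the symmetric pair
 *
 *     dst[i] = s0*win[j] - s1*win[i] + add_bias
 *     dst[j] = s0*win[i] + s1*win[j] + add_bias
 *
 * add_bias is the offset (e.g. 385.0) some float decoders fold in here
 * so that float_to_int16() below can use its integer bit trick.
 */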
4061 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
4062 int i;
4063 for(i=0; i<len; i++)
4064 dst[i] = src[i] * mul;
4065 }
4067 static av_always_inline int float_to_int16_one(const float *src){
4068 int_fast32_t tmp = *(const int32_t*)src;
4069 if(tmp & 0xf0000){
4070 tmp = (0x43c0ffff - tmp)>>31;
4071 // is this faster on some gcc/cpu combinations?
4072 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4073 // else tmp = 0;
4074 }
4075 return tmp - 0x8000;
4076 }
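/* float_to_int16_one() assumes the caller pre-biased each sample by
 * 385.0: every float in [384.0, 386.0) shares one exponent, so the low
 * 16 bits of its IEEE-754 pattern encode the sample linearly, and the
 * final "- 0x8000" recenters them (the int16_t store truncates the high
 * bits).  (tmp & 0xf0000) is nonzero exactly when the value left that
 * range, and (0x43c0ffff - tmp) >> 31 yields 0 below the range and -1
 * above it, which the same "- 0x8000" plus truncation turn into -32768
 * and 32767: a branchless clamp.  Worked example (illustrative): 385.0f
 * has the bit pattern 0x43c08000, whose low 16 bits minus 0x8000 give 0.
 */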
4078 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
4079 int i;
4080 for(i=0; i<len; i++)
4081 dst[i] = float_to_int16_one(src+i);
4082 }
4084 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4085 int i,j,c;
4086 if(channels==2){
4087 for(i=0; i<len; i++){
4088 dst[2*i] = float_to_int16_one(src[0]+i);
4089 dst[2*i+1] = float_to_int16_one(src[1]+i);
4090 }
4091 }else{
4092 for(c=0; c<channels; c++)
4093 for(i=0, j=c; i<len; i++, j+=channels)
4094 dst[j] = float_to_int16_one(src[c]+i);
4095 }
4096 }
4098 static void add_int16_c(int16_t * v1, int16_t * v2, int order)
4099 {
4100 while (order--)
4101 *v1++ += *v2++;
4102 }
4104 static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
4105 {
4106 while (order--)
4107 *v1++ -= *v2++;
4108 }
4110 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4111 {
4112 int res = 0;
4114 while (order--)
4115 res += (*v1++ * *v2++) >> shift;
4117 return res;
4118 }
4120 #define W0 2048
4121 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4122 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4123 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4124 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4125 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4126 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4127 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
4129 static void wmv2_idct_row(short * b)
4130 {
4131 int s1,s2;
4132 int a0,a1,a2,a3,a4,a5,a6,a7;
4133 /*step 1*/
4134 a1 = W1*b[1]+W7*b[7];
4135 a7 = W7*b[1]-W1*b[7];
4136 a5 = W5*b[5]+W3*b[3];
4137 a3 = W3*b[5]-W5*b[3];
4138 a2 = W2*b[2]+W6*b[6];
4139 a6 = W6*b[2]-W2*b[6];
4140 a0 = W0*b[0]+W0*b[4];
4141 a4 = W0*b[0]-W0*b[4];
4142 /*step 2*/
4143 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4144 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4145 /*step 3*/
4146 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4147 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4148 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4149 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4150 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4151 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4152 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4153 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4154 }
4155 static void wmv2_idct_col(short * b)
4156 {
4157 int s1,s2;
4158 int a0,a1,a2,a3,a4,a5,a6,a7;
4159 /*step 1, with extended precision*/
4160 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4161 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4162 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4163 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4164 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4165 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4166 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4167 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4168 /*step 2*/
4169 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4170 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4171 /*step 3*/
4172 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4173 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4174 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4175 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4177 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4178 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4179 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4180 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4181 }
4182 void ff_wmv2_idct_c(short * block){
4183 int i;
4185 for(i=0;i<64;i+=8){
4186 wmv2_idct_row(block+i);
4187 }
4188 for(i=0;i<8;i++){
4189 wmv2_idct_col(block+i);
4190 }
4191 }
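/* ff_wmv2_idct_c is a separable row/column integer IDCT: as the W*
 * comments above note, W1..W7 are round(2048*sqrt(2)*cos(k*pi/16))
 * butterfly constants.  Rows normalize with (x + (1<<7)) >> 8, while
 * columns first pre-round with (x+4)>>3 to keep three extra bits of
 * precision and then normalize with >>14, limiting rounding error
 * without needing 64-bit intermediates.
 */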
4192 /* XXX: these functions should be removed as soon as all IDCTs are
4193 converted */
4194 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4195 {
4196 ff_wmv2_idct_c(block);
4197 put_pixels_clamped_c(block, dest, line_size);
4198 }
4199 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4200 {
4201 ff_wmv2_idct_c(block);
4202 add_pixels_clamped_c(block, dest, line_size);
4203 }
4204 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4205 {
4206 j_rev_dct (block);
4207 put_pixels_clamped_c(block, dest, line_size);
4208 }
4209 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4210 {
4211 j_rev_dct (block);
4212 add_pixels_clamped_c(block, dest, line_size);
4213 }
4215 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4216 {
4217 j_rev_dct4 (block);
4218 put_pixels_clamped4_c(block, dest, line_size);
4219 }
4220 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4221 {
4222 j_rev_dct4 (block);
4223 add_pixels_clamped4_c(block, dest, line_size);
4224 }
4226 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4227 {
4228 j_rev_dct2 (block);
4229 put_pixels_clamped2_c(block, dest, line_size);
4230 }
4231 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4232 {
4233 j_rev_dct2 (block);
4234 add_pixels_clamped2_c(block, dest, line_size);
4235 }
4237 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4238 {
4239 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4241 dest[0] = cm[(block[0] + 4)>>3];
4242 }
4243 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4244 {
4245 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4247 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4248 }
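/* The ff_jref_idctN wrappers pair a reduced-size inverse DCT with the
 * matching clamped put/add: only the top-left NxN coefficients are
 * transformed.  dsputil_init() below selects them when avctx->lowres
 * requests 1/2 (idct4), 1/4 (idct2) or 1/8 (idct1, DC only) resolution
 * decoding.
 */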
4250 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4252 /* init static data */
4253 void dsputil_static_init(void)
4254 {
4255 int i;
4257 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4258 for(i=0;i<MAX_NEG_CROP;i++) {
4259 ff_cropTbl[i] = 0;
4260 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4261 }
4263 for(i=0;i<512;i++) {
4264 ff_squareTbl[i] = (i - 256) * (i - 256);
4265 }
4267 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4268 }
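/* ff_cropTbl is a clamping LUT with MAX_NEG_CROP guard bytes of 0 below
 * and 255 above the identity section, so cm[x] behaves like
 * av_clip_uint8(x) without branches for moderately out-of-range x;
 * ff_squareTbl similarly caches (i-256)^2.  A hedged sketch of the usual
 * access pattern (the function name is illustrative):
 */
#if 0
static uint8_t clamp_example(int v)
{
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* recenter on 0 */
    return cm[v]; /* valid for v in [-MAX_NEG_CROP, 255+MAX_NEG_CROP) */
}
#endif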
4270 int ff_check_alignment(void){
4271 static int did_fail=0;
4272 DECLARE_ALIGNED_16(int, aligned);
4274 if((long)&aligned & 15){
4275 if(!did_fail){
4276 #if HAVE_MMX || HAVE_ALTIVEC
4277 av_log(NULL, AV_LOG_ERROR,
4278 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4279 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4280 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4281 "Do not report crashes to FFmpeg developers.\n");
4282 #endif
4283 did_fail=1;
4284 }
4285 return -1;
4286 }
4287 return 0;
4288 }
4290 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4291 {
4292 int i;
4294 ff_check_alignment();
4296 #if CONFIG_ENCODERS
4297 if(avctx->dct_algo==FF_DCT_FASTINT) {
4298 c->fdct = fdct_ifast;
4299 c->fdct248 = fdct_ifast248;
4300 }
4301 else if(avctx->dct_algo==FF_DCT_FAAN) {
4302 c->fdct = ff_faandct;
4303 c->fdct248 = ff_faandct248;
4304 }
4305 else {
4306 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4307 c->fdct248 = ff_fdct248_islow;
4308 }
4309 #endif //CONFIG_ENCODERS
4311 if(avctx->lowres==1){
4312 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4313 c->idct_put= ff_jref_idct4_put;
4314 c->idct_add= ff_jref_idct4_add;
4315 }else{
4316 c->idct_put= ff_h264_lowres_idct_put_c;
4317 c->idct_add= ff_h264_lowres_idct_add_c;
4318 }
4319 c->idct = j_rev_dct4;
4320 c->idct_permutation_type= FF_NO_IDCT_PERM;
4321 }else if(avctx->lowres==2){
4322 c->idct_put= ff_jref_idct2_put;
4323 c->idct_add= ff_jref_idct2_add;
4324 c->idct = j_rev_dct2;
4325 c->idct_permutation_type= FF_NO_IDCT_PERM;
4326 }else if(avctx->lowres==3){
4327 c->idct_put= ff_jref_idct1_put;
4328 c->idct_add= ff_jref_idct1_add;
4329 c->idct = j_rev_dct1;
4330 c->idct_permutation_type= FF_NO_IDCT_PERM;
4331 }else{
4332 if(avctx->idct_algo==FF_IDCT_INT){
4333 c->idct_put= ff_jref_idct_put;
4334 c->idct_add= ff_jref_idct_add;
4335 c->idct = j_rev_dct;
4336 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4337 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER ) &&
4338 avctx->idct_algo==FF_IDCT_VP3){
4339 c->idct_put= ff_vp3_idct_put_c;
4340 c->idct_add= ff_vp3_idct_add_c;
4341 c->idct = ff_vp3_idct_c;
4342 c->idct_permutation_type= FF_NO_IDCT_PERM;
4343 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4344 c->idct_put= ff_wmv2_idct_put_c;
4345 c->idct_add= ff_wmv2_idct_add_c;
4346 c->idct = ff_wmv2_idct_c;
4347 c->idct_permutation_type= FF_NO_IDCT_PERM;
4348 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4349 c->idct_put= ff_faanidct_put;
4350 c->idct_add= ff_faanidct_add;
4351 c->idct = ff_faanidct;
4352 c->idct_permutation_type= FF_NO_IDCT_PERM;
4353 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4354 c->idct_put= ff_ea_idct_put_c;
4355 c->idct_permutation_type= FF_NO_IDCT_PERM;
4356 }else{ //accurate/default
4357 c->idct_put= ff_simple_idct_put;
4358 c->idct_add= ff_simple_idct_add;
4359 c->idct = ff_simple_idct;
4360 c->idct_permutation_type= FF_NO_IDCT_PERM;
4361 }
4362 }
4364 if (CONFIG_H264_DECODER) {
4365 c->h264_idct_add= ff_h264_idct_add_c;
4366 c->h264_idct8_add= ff_h264_idct8_add_c;
4367 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4368 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4369 c->h264_idct_add16 = ff_h264_idct_add16_c;
4370 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4371 c->h264_idct_add8 = ff_h264_idct_add8_c;
4372 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4373 }
4375 c->get_pixels = get_pixels_c;
4376 c->diff_pixels = diff_pixels_c;
4377 c->put_pixels_clamped = put_pixels_clamped_c;
4378 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4379 c->add_pixels_clamped = add_pixels_clamped_c;
4380 c->add_pixels8 = add_pixels8_c;
4381 c->add_pixels4 = add_pixels4_c;
4382 c->sum_abs_dctelem = sum_abs_dctelem_c;
4383 c->gmc1 = gmc1_c;
4384 c->gmc = ff_gmc_c;
4385 c->clear_block = clear_block_c;
4386 c->clear_blocks = clear_blocks_c;
4387 c->pix_sum = pix_sum_c;
4388 c->pix_norm1 = pix_norm1_c;
4390 /* TODO [0] 16 [1] 8 */
4391 c->pix_abs[0][0] = pix_abs16_c;
4392 c->pix_abs[0][1] = pix_abs16_x2_c;
4393 c->pix_abs[0][2] = pix_abs16_y2_c;
4394 c->pix_abs[0][3] = pix_abs16_xy2_c;
4395 c->pix_abs[1][0] = pix_abs8_c;
4396 c->pix_abs[1][1] = pix_abs8_x2_c;
4397 c->pix_abs[1][2] = pix_abs8_y2_c;
4398 c->pix_abs[1][3] = pix_abs8_xy2_c;
4400 #define dspfunc(PFX, IDX, NUM) \
4401 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4402 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4403 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4404 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4406 dspfunc(put, 0, 16);
4407 dspfunc(put_no_rnd, 0, 16);
4408 dspfunc(put, 1, 8);
4409 dspfunc(put_no_rnd, 1, 8);
4410 dspfunc(put, 2, 4);
4411 dspfunc(put, 3, 2);
4413 dspfunc(avg, 0, 16);
4414 dspfunc(avg_no_rnd, 0, 16);
4415 dspfunc(avg, 1, 8);
4416 dspfunc(avg_no_rnd, 1, 8);
4417 dspfunc(avg, 2, 4);
4418 dspfunc(avg, 3, 2);
4419 #undef dspfunc
4421 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4422 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4424 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4425 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4426 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4427 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4428 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4429 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4430 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4431 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4432 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4434 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4435 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4436 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4437 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4438 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4439 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4440 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4441 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4442 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4444 #define dspfunc(PFX, IDX, NUM) \
4445 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4446 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4447 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4448 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4449 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4450 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4451 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4452 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4453 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4454 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4455 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4456 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4457 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4458 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4459 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4460 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4462 dspfunc(put_qpel, 0, 16);
4463 dspfunc(put_no_rnd_qpel, 0, 16);
4465 dspfunc(avg_qpel, 0, 16);
4466 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4468 dspfunc(put_qpel, 1, 8);
4469 dspfunc(put_no_rnd_qpel, 1, 8);
4471 dspfunc(avg_qpel, 1, 8);
4472 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4474 dspfunc(put_h264_qpel, 0, 16);
4475 dspfunc(put_h264_qpel, 1, 8);
4476 dspfunc(put_h264_qpel, 2, 4);
4477 dspfunc(put_h264_qpel, 3, 2);
4478 dspfunc(avg_h264_qpel, 0, 16);
4479 dspfunc(avg_h264_qpel, 1, 8);
4480 dspfunc(avg_h264_qpel, 2, 4);
4482 #undef dspfunc
4483 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4484 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4485 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4486 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4487 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4488 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4489 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4491 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4492 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4493 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4494 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4495 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4496 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4497 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4498 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4499 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4500 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4501 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4502 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4503 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4504 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4505 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4506 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4507 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4508 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4509 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4510 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4512 c->draw_edges = draw_edges_c;
4514 #if CONFIG_CAVS_DECODER
4515 ff_cavsdsp_init(c,avctx);
4516 #endif
4517 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
4518 ff_vc1dsp_init(c,avctx);
4519 #endif
4520 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
4521 ff_intrax8dsp_init(c,avctx);
4522 #endif
4523 #if CONFIG_RV30_DECODER
4524 ff_rv30dsp_init(c,avctx);
4525 #endif
4526 #if CONFIG_RV40_DECODER
4527 ff_rv40dsp_init(c,avctx);
4528 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4529 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4530 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4531 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4532 #endif
4534 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4535 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4536 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4537 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4538 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4539 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4540 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4541 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4543 #define SET_CMP_FUNC(name) \
4544 c->name[0]= name ## 16_c;\
4545 c->name[1]= name ## 8x8_c;
4547 SET_CMP_FUNC(hadamard8_diff)
4548 c->hadamard8_diff[4]= hadamard8_intra16_c;
4549 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4550 SET_CMP_FUNC(dct_sad)
4551 SET_CMP_FUNC(dct_max)
4552 #if CONFIG_GPL
4553 SET_CMP_FUNC(dct264_sad)
4554 #endif
4555 c->sad[0]= pix_abs16_c;
4556 c->sad[1]= pix_abs8_c;
4557 c->sse[0]= sse16_c;
4558 c->sse[1]= sse8_c;
4559 c->sse[2]= sse4_c;
4560 SET_CMP_FUNC(quant_psnr)
4561 SET_CMP_FUNC(rd)
4562 SET_CMP_FUNC(bit)
4563 c->vsad[0]= vsad16_c;
4564 c->vsad[4]= vsad_intra16_c;
4565 c->vsad[5]= vsad_intra8_c;
4566 c->vsse[0]= vsse16_c;
4567 c->vsse[4]= vsse_intra16_c;
4568 c->vsse[5]= vsse_intra8_c;
4569 c->nsse[0]= nsse16_c;
4570 c->nsse[1]= nsse8_c;
4571 #if CONFIG_SNOW_ENCODER
4572 c->w53[0]= w53_16_c;
4573 c->w53[1]= w53_8_c;
4574 c->w97[0]= w97_16_c;
4575 c->w97[1]= w97_8_c;
4576 #endif
4578 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4580 c->add_bytes= add_bytes_c;
4581 c->add_bytes_l2= add_bytes_l2_c;
4582 c->diff_bytes= diff_bytes_c;
4583 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4584 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4585 c->bswap_buf= bswap_buf;
4586 #if CONFIG_PNG_DECODER
4587 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4588 #endif
4590 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4591 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4592 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4593 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4594 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4595 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4596 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4597 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4598 c->h264_loop_filter_strength= NULL;
4600 if (CONFIG_ANY_H263) {
4601 c->h263_h_loop_filter= h263_h_loop_filter_c;
4602 c->h263_v_loop_filter= h263_v_loop_filter_c;
4603 }
4605 if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
4606 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4607 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4608 }
4609 if (CONFIG_VP6_DECODER) {
4610 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4611 }
4613 c->h261_loop_filter= h261_loop_filter_c;
4615 c->try_8x8basis= try_8x8basis_c;
4616 c->add_8x8basis= add_8x8basis_c;
4618 #if CONFIG_SNOW_DECODER
4619 c->vertical_compose97i = ff_snow_vertical_compose97i;
4620 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4621 c->inner_add_yblock = ff_snow_inner_add_yblock;
4622 #endif
4624 #if CONFIG_VORBIS_DECODER
4625 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4626 #endif
4627 #if CONFIG_AC3_DECODER
4628 c->ac3_downmix = ff_ac3_downmix_c;
4629 #endif
4630 #if CONFIG_FLAC_ENCODER
4631 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4632 #endif
4633 c->vector_fmul = vector_fmul_c;
4634 c->vector_fmul_reverse = vector_fmul_reverse_c;
4635 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4636 c->vector_fmul_window = ff_vector_fmul_window_c;
4637 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4638 c->float_to_int16 = ff_float_to_int16_c;
4639 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4640 c->add_int16 = add_int16_c;
4641 c->sub_int16 = sub_int16_c;
4642 c->scalarproduct_int16 = scalarproduct_int16_c;
4644 c->shrink[0]= ff_img_copy_plane;
4645 c->shrink[1]= ff_shrink22;
4646 c->shrink[2]= ff_shrink44;
4647 c->shrink[3]= ff_shrink88;
4649 c->prefetch= just_return;
4651 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4652 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4654 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4655 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4656 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4657 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4658 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4659 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4660 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4661 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4662 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4664 for(i=0; i<64; i++){
4665 if(!c->put_2tap_qpel_pixels_tab[0][i])
4666 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4667 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4668 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4669 }
4671 switch(c->idct_permutation_type){
4672 case FF_NO_IDCT_PERM:
4673 for(i=0; i<64; i++)
4674 c->idct_permutation[i]= i;
4675 break;
4676 case FF_LIBMPEG2_IDCT_PERM:
4677 for(i=0; i<64; i++)
4678 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4679 break;
4680 case FF_SIMPLE_IDCT_PERM:
4681 for(i=0; i<64; i++)
4682 c->idct_permutation[i]= simple_mmx_permutation[i];
4683 break;
4684 case FF_TRANSPOSE_IDCT_PERM:
4685 for(i=0; i<64; i++)
4686 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4687 break;
4688 case FF_PARTTRANS_IDCT_PERM:
4689 for(i=0; i<64; i++)
4690 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4691 break;
4692 case FF_SSE2_IDCT_PERM:
4693 for(i=0; i<64; i++)
4694 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4695 break;
4696 default:
4697 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");