/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * @file dsputil.c
 * DSP utils
 */
#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "faandct.h"
#include "faanidct.h"
#include "h263.h"
#include "snow.h"
/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };
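
/* Note: ff_squareTbl is initialized elsewhere (not in this excerpt) with the
 * squares of -256..255; the SSE/pix_norm routines below index it through a
 * pointer offset by +256, so sq[d] == d*d for any signed difference d in
 * [-255, 255]. */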
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
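
/* A minimal sketch (helper name hypothetical, not part of dsputil) of how
 * such replicated-byte masks enable SIMD-within-a-register byte arithmetic,
 * here adding packed bytes without carries leaking between bytes:
 *
 *     static inline unsigned long add_bytes_swar(unsigned long a, unsigned long b)
 *     {
 *         unsigned long l = (a & pb_7f) + (b & pb_7f); // low 7 bits, carry into bit 7
 *         return l ^ ((a ^ b) & pb_80);                // add the MSBs modulo 2
 *     }
 */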
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};

/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
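
/* Illustrative check of the identity above: with a = 1000 and b = 7,
 * ff_inverse[7] == 613566757, and
 * (uint32_t)((1000ULL * 613566757) >> 32) == 142 == 1000/7. */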
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i;
    int end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j;
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#ifdef ARCH_POWERPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        int j;
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
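
/* A hedged usage sketch (the permutation source shown is an assumption, not
 * taken from this file):
 *
 *     ScanTable st;
 *     ff_init_scantable(c->idct_permutation, &st, ff_zigzag_direct);
 *
 * st.permutated[] then holds the zigzag order remapped for the active IDCT,
 * and st.raster_end[i] gives the highest permuted index reached by scan
 * position i, which lets a decoder bound the nonzero coefficient area. */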
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if 0
            s += sq[pix[0]];
            s += sq[pix[1]];
            s += sq[pix[2]];
            s += sq[pix[3]];
            s += sq[pix[4]];
            s += sq[pix[5]];
            s += sq[pix[6]];
            s += sq[pix[7]];
#else
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}

static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}

static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint32_t *sq = ff_squareTbl + 256;

    s = 0;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}


#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
      {
        {
            // 9/7 8x8 dec=3
            {268, 239, 239, 213},
            {  0, 224, 224, 152},
            {  0, 135, 135, 110},
        },{
            // 9/7 16x16 or 32x32 dec=4
            {344, 310, 310, 280},
            {  0, 320, 320, 228},
            {  0, 175, 175, 136},
            {  0, 129, 129, 102},
        }
      },{
        {
            // 5/3 8x8 dec=3
            {275, 245, 245, 218},
            {  0, 230, 230, 156},
            {  0, 138, 138, 113},
        },{
            // 5/3 16x16 or 32x32 dec=4
            {352, 317, 317, 286},
            {  0, 328, 328, 233},
            {  0, 180, 180, 140},
            {  0, 132, 132, 105},
        }
      }
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    assert(w==h);
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    assert(s>=0);
    return s>>9;
}

static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        /* top and bottom */
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
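
/* A hedged usage sketch: when a motion vector points (partly) outside the
 * picture, callers route the read through a scratch buffer whose stride
 * matches linesize (buffer and edge-position names are illustrative):
 *
 *     ff_emulated_edge_mc(edge_emu_buffer, src_ptr, linesize,
 *                         9, 9, src_x, src_y, h_edge_pos, v_edge_pos);
 *     src_ptr = edge_emu_buffer; // then run the normal MC code on src_ptr
 */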
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}


static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                          int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;
    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}

static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
#if 0

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c        , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
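
/* For reference: the word-wide helpers used by PIXOP2 above are the packed
 * counterparts of these macros. Per byte, (a+b+1)>>1 == (a|b) - ((a^b)>>1)
 * and (a+b)>>1 == (a&b) + ((a^b)>>1); masking the shifted term with
 * 0xFEFEFEFE keeps bits from leaking between neighbouring bytes. */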
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}

static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
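
/* For reference: ff_gmc_c evaluates an affine motion field, sampling the
 * source at vx = ox + x*dxx + y*dxy and vy = oy + x*dyx + y*dyy (fixed point:
 * 16 fractional bits plus 'shift' subpel bits that feed the bilinear
 * weights), interpolating inside the picture and clamping coordinates that
 * fall outside it. */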
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#if 0
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
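
/* The four bilinear weights always sum to 64: A+B+C+D == (8-x)*(8-y)
 * + x*(8-y) + (8-x)*y + x*y == 64, so op_put's "(b + 32) >> 6" divides the
 * weighted sum by 64 with rounding to nearest, and op_avg then averages that
 * result with the existing pixel, rounding up. */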
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1812 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1813 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1814 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1815 dst++;\
1816 src++;\
1820 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1821 OPNAME ## pixels8_c(dst, src, stride, 8);\
1824 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1825 uint8_t half[64];\
1826 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1827 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1830 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1831 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1834 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1835 uint8_t half[64];\
1836 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1837 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1840 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1841 uint8_t full[16*9];\
1842 uint8_t half[64];\
1843 copy_block9(full, src, 16, stride, 9);\
1844 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1845 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1848 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1849 uint8_t full[16*9];\
1850 copy_block9(full, src, 16, stride, 9);\
1851 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1854 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1855 uint8_t full[16*9];\
1856 uint8_t half[64];\
1857 copy_block9(full, src, 16, stride, 9);\
1858 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1859 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1861 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1862 uint8_t full[16*9];\
1863 uint8_t halfH[72];\
1864 uint8_t halfV[64];\
1865 uint8_t halfHV[64];\
1866 copy_block9(full, src, 16, stride, 9);\
1867 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1868 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1869 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1870 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1872 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1873 uint8_t full[16*9];\
1874 uint8_t halfH[72];\
1875 uint8_t halfHV[64];\
1876 copy_block9(full, src, 16, stride, 9);\
1877 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1878 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1879 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1880 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1882 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1883 uint8_t full[16*9];\
1884 uint8_t halfH[72];\
1885 uint8_t halfV[64];\
1886 uint8_t halfHV[64];\
1887 copy_block9(full, src, 16, stride, 9);\
1888 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1889 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1890 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1891 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1893 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1894 uint8_t full[16*9];\
1895 uint8_t halfH[72];\
1896 uint8_t halfHV[64];\
1897 copy_block9(full, src, 16, stride, 9);\
1898 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1899 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1900 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1901 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1903 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1904 uint8_t full[16*9];\
1905 uint8_t halfH[72];\
1906 uint8_t halfV[64];\
1907 uint8_t halfHV[64];\
1908 copy_block9(full, src, 16, stride, 9);\
1909 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1910 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1911 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1912 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1914 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1915 uint8_t full[16*9];\
1916 uint8_t halfH[72];\
1917 uint8_t halfHV[64];\
1918 copy_block9(full, src, 16, stride, 9);\
1919 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1920 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1921 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1922 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1924 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1925 uint8_t full[16*9];\
1926 uint8_t halfH[72];\
1927 uint8_t halfV[64];\
1928 uint8_t halfHV[64];\
1929 copy_block9(full, src, 16, stride, 9);\
1930 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1931 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1932 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1933 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1935 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1936 uint8_t full[16*9];\
1937 uint8_t halfH[72];\
1938 uint8_t halfHV[64];\
1939 copy_block9(full, src, 16, stride, 9);\
1940 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1941 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1942 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1943 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1945 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1946 uint8_t halfH[72];\
1947 uint8_t halfHV[64];\
1948 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1949 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1950 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1952 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1953 uint8_t halfH[72];\
1954 uint8_t halfHV[64];\
1955 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1956 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1957 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1959 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1960 uint8_t full[16*9];\
1961 uint8_t halfH[72];\
1962 uint8_t halfV[64];\
1963 uint8_t halfHV[64];\
1964 copy_block9(full, src, 16, stride, 9);\
1965 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1966 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1967 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1968 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1970 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1971 uint8_t full[16*9];\
1972 uint8_t halfH[72];\
1973 copy_block9(full, src, 16, stride, 9);\
1974 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1975 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1976 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1978 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1979 uint8_t full[16*9];\
1980 uint8_t halfH[72];\
1981 uint8_t halfV[64];\
1982 uint8_t halfHV[64];\
1983 copy_block9(full, src, 16, stride, 9);\
1984 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1985 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1986 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1987 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1989 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1990 uint8_t full[16*9];\
1991 uint8_t halfH[72];\
1992 copy_block9(full, src, 16, stride, 9);\
1993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1994 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1995 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1997 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1998 uint8_t halfH[72];\
1999 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2000 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2002 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2003 OPNAME ## pixels16_c(dst, src, stride, 16);\
2006 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2007 uint8_t half[256];\
2008 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2009 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2012 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2013 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2016 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2017 uint8_t half[256];\
2018 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2019 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2022 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2023 uint8_t full[24*17];\
2024 uint8_t half[256];\
2025 copy_block17(full, src, 24, stride, 17);\
2026 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2027 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2030 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2031 uint8_t full[24*17];\
2032 copy_block17(full, src, 24, stride, 17);\
2033 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2036 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2037 uint8_t full[24*17];\
2038 uint8_t half[256];\
2039 copy_block17(full, src, 24, stride, 17);\
2040 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2041 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2043 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2044 uint8_t full[24*17];\
2045 uint8_t halfH[272];\
2046 uint8_t halfV[256];\
2047 uint8_t halfHV[256];\
2048 copy_block17(full, src, 24, stride, 17);\
2049 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2050 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2051 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2052 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2054 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2055 uint8_t full[24*17];\
2056 uint8_t halfH[272];\
2057 uint8_t halfHV[256];\
2058 copy_block17(full, src, 24, stride, 17);\
2059 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2060 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2061 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2062 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2064 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2065 uint8_t full[24*17];\
2066 uint8_t halfH[272];\
2067 uint8_t halfV[256];\
2068 uint8_t halfHV[256];\
2069 copy_block17(full, src, 24, stride, 17);\
2070 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2071 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2072 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2073 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2075 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2076 uint8_t full[24*17];\
2077 uint8_t halfH[272];\
2078 uint8_t halfHV[256];\
2079 copy_block17(full, src, 24, stride, 17);\
2080 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2081 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2082 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2083 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2085 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2086 uint8_t full[24*17];\
2087 uint8_t halfH[272];\
2088 uint8_t halfV[256];\
2089 uint8_t halfHV[256];\
2090 copy_block17(full, src, 24, stride, 17);\
2091 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2092 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2093 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2094 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2096 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2097 uint8_t full[24*17];\
2098 uint8_t halfH[272];\
2099 uint8_t halfHV[256];\
2100 copy_block17(full, src, 24, stride, 17);\
2101 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2102 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2103 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2104 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2106 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2107 uint8_t full[24*17];\
2108 uint8_t halfH[272];\
2109 uint8_t halfV[256];\
2110 uint8_t halfHV[256];\
2111 copy_block17(full, src, 24, stride, 17);\
2112 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2113 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2114 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2115 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2117 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2118 uint8_t full[24*17];\
2119 uint8_t halfH[272];\
2120 uint8_t halfHV[256];\
2121 copy_block17(full, src, 24, stride, 17);\
2122 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2123 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2124 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2125 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2127 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2128 uint8_t halfH[272];\
2129 uint8_t halfHV[256];\
2130 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2131 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2132 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2134 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2135 uint8_t halfH[272];\
2136 uint8_t halfHV[256];\
2137 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2138 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2139 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2141 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2142 uint8_t full[24*17];\
2143 uint8_t halfH[272];\
2144 uint8_t halfV[256];\
2145 uint8_t halfHV[256];\
2146 copy_block17(full, src, 24, stride, 17);\
2147 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2148 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2149 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2150 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2152 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2153 uint8_t full[24*17];\
2154 uint8_t halfH[272];\
2155 copy_block17(full, src, 24, stride, 17);\
2156 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2157 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2158 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2160 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2161 uint8_t full[24*17];\
2162 uint8_t halfH[272];\
2163 uint8_t halfV[256];\
2164 uint8_t halfHV[256];\
2165 copy_block17(full, src, 24, stride, 17);\
2166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2167 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2169 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2171 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2172 uint8_t full[24*17];\
2173 uint8_t halfH[272];\
2174 copy_block17(full, src, 24, stride, 17);\
2175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2176 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2177 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2179 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2180 uint8_t halfH[272];\
2181 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2182 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2185 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2186 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2187 #define op_put(a, b) a = cm[((b) + 16)>>5]
2188 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2190 QPEL_MC(0, put_ , _ , op_put)
2191 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2192 QPEL_MC(0, avg_ , _ , op_avg)
2193 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2194 #undef op_avg
2195 #undef op_avg_no_rnd
2196 #undef op_put
2197 #undef op_put_no_rnd
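/* QPEL_MC stamps out the full set of sixteen quarter-pel motion compensation
 * functions per block size for each rounding/averaging mode; e.g.
 * QPEL_MC(0, put_, _, op_put) expands to put_qpel8_mc00_c through
 * put_qpel16_mc33_c.  The mcXY suffix encodes the quarter-pel offset
 * (X = horizontal quarters, Y = vertical quarters).  A sketch of how a
 * caller selects a variant from a fractional motion vector (the helper is
 * hypothetical, for illustration only):
 */
static av_unused void qpel_dispatch_sketch(uint8_t *dst, uint8_t *src,
                                           int stride, int mx, int my,
                                           qpel_mc_func *tab /* 16 entries */)
{
    /* mx,my in quarter-pel units; the low two bits of each select the mcXY
     * variant, giving the dxy index used with the *_pixels_tab tables */
    tab[((my & 3) << 2) | (mx & 3)](dst, src, stride);
}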
2199 #if 1
2200 #define H264_LOWPASS(OPNAME, OP, OP2) \
2201 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2202 const int h=2;\
2203 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2204 int i;\
2205 for(i=0; i<h; i++)\
2207 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2208 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2209 dst+=dstStride;\
2210 src+=srcStride;\
2214 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2215 const int w=2;\
2216 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2217 int i;\
2218 for(i=0; i<w; i++)\
2220 const int srcB= src[-2*srcStride];\
2221 const int srcA= src[-1*srcStride];\
2222 const int src0= src[0 *srcStride];\
2223 const int src1= src[1 *srcStride];\
2224 const int src2= src[2 *srcStride];\
2225 const int src3= src[3 *srcStride];\
2226 const int src4= src[4 *srcStride];\
2227 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2228 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2229 dst++;\
2230 src++;\
2234 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2235 const int h=2;\
2236 const int w=2;\
2237 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2238 int i;\
2239 src -= 2*srcStride;\
2240 for(i=0; i<h+5; i++)\
2242 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2243 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2244 tmp+=tmpStride;\
2245 src+=srcStride;\
2247 tmp -= tmpStride*(h+5-2);\
2248 for(i=0; i<w; i++)\
2250 const int tmpB= tmp[-2*tmpStride];\
2251 const int tmpA= tmp[-1*tmpStride];\
2252 const int tmp0= tmp[0 *tmpStride];\
2253 const int tmp1= tmp[1 *tmpStride];\
2254 const int tmp2= tmp[2 *tmpStride];\
2255 const int tmp3= tmp[3 *tmpStride];\
2256 const int tmp4= tmp[4 *tmpStride];\
2257 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2258 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2259 dst++;\
2260 tmp++;\
2263 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2264 const int h=4;\
2265 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2266 int i;\
2267 for(i=0; i<h; i++)\
2269 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2270 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2271 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2272 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2273 dst+=dstStride;\
2274 src+=srcStride;\
2278 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2279 const int w=4;\
2280 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2281 int i;\
2282 for(i=0; i<w; i++)\
2284 const int srcB= src[-2*srcStride];\
2285 const int srcA= src[-1*srcStride];\
2286 const int src0= src[0 *srcStride];\
2287 const int src1= src[1 *srcStride];\
2288 const int src2= src[2 *srcStride];\
2289 const int src3= src[3 *srcStride];\
2290 const int src4= src[4 *srcStride];\
2291 const int src5= src[5 *srcStride];\
2292 const int src6= src[6 *srcStride];\
2293 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2294 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2295 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2296 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2297 dst++;\
2298 src++;\
2302 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2303 const int h=4;\
2304 const int w=4;\
2305 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2306 int i;\
2307 src -= 2*srcStride;\
2308 for(i=0; i<h+5; i++)\
2310 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2311 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2312 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2313 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2314 tmp+=tmpStride;\
2315 src+=srcStride;\
2317 tmp -= tmpStride*(h+5-2);\
2318 for(i=0; i<w; i++)\
2320 const int tmpB= tmp[-2*tmpStride];\
2321 const int tmpA= tmp[-1*tmpStride];\
2322 const int tmp0= tmp[0 *tmpStride];\
2323 const int tmp1= tmp[1 *tmpStride];\
2324 const int tmp2= tmp[2 *tmpStride];\
2325 const int tmp3= tmp[3 *tmpStride];\
2326 const int tmp4= tmp[4 *tmpStride];\
2327 const int tmp5= tmp[5 *tmpStride];\
2328 const int tmp6= tmp[6 *tmpStride];\
2329 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2330 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2331 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2332 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2333 dst++;\
2334 tmp++;\
2338 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2339 const int h=8;\
2340 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2341 int i;\
2342 for(i=0; i<h; i++)\
2344 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2345 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2346 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2347 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2348 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2349 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2350 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2351 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2352 dst+=dstStride;\
2353 src+=srcStride;\
2357 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2358 const int w=8;\
2359 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2360 int i;\
2361 for(i=0; i<w; i++)\
2363 const int srcB= src[-2*srcStride];\
2364 const int srcA= src[-1*srcStride];\
2365 const int src0= src[0 *srcStride];\
2366 const int src1= src[1 *srcStride];\
2367 const int src2= src[2 *srcStride];\
2368 const int src3= src[3 *srcStride];\
2369 const int src4= src[4 *srcStride];\
2370 const int src5= src[5 *srcStride];\
2371 const int src6= src[6 *srcStride];\
2372 const int src7= src[7 *srcStride];\
2373 const int src8= src[8 *srcStride];\
2374 const int src9= src[9 *srcStride];\
2375 const int src10=src[10*srcStride];\
2376 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2377 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2378 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2379 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2380 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2381 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2382 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2383 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2384 dst++;\
2385 src++;\
2389 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2390 const int h=8;\
2391 const int w=8;\
2392 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2393 int i;\
2394 src -= 2*srcStride;\
2395 for(i=0; i<h+5; i++)\
2397 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2398 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2399 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2400 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2401 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2402 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2403 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2404 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2405 tmp+=tmpStride;\
2406 src+=srcStride;\
2408 tmp -= tmpStride*(h+5-2);\
2409 for(i=0; i<w; i++)\
2411 const int tmpB= tmp[-2*tmpStride];\
2412 const int tmpA= tmp[-1*tmpStride];\
2413 const int tmp0= tmp[0 *tmpStride];\
2414 const int tmp1= tmp[1 *tmpStride];\
2415 const int tmp2= tmp[2 *tmpStride];\
2416 const int tmp3= tmp[3 *tmpStride];\
2417 const int tmp4= tmp[4 *tmpStride];\
2418 const int tmp5= tmp[5 *tmpStride];\
2419 const int tmp6= tmp[6 *tmpStride];\
2420 const int tmp7= tmp[7 *tmpStride];\
2421 const int tmp8= tmp[8 *tmpStride];\
2422 const int tmp9= tmp[9 *tmpStride];\
2423 const int tmp10=tmp[10*tmpStride];\
2424 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2425 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2426 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2427 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2428 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2429 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2430 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2431 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2432 dst++;\
2433 tmp++;\
2437 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2438 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2439 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2440 src += 8*srcStride;\
2441 dst += 8*dstStride;\
2442 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2443 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2446 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2447 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2448 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2449 src += 8*srcStride;\
2450 dst += 8*dstStride;\
2451 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2452 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2455 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2456 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2457 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2458 src += 8*srcStride;\
2459 dst += 8*dstStride;\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2461 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2462 }
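/* All H264_LOWPASS bodies above apply the H.264 six-tap half-sample filter
 * (1, -5, 20, 20, -5, 1).  The plain _h_ and _v_ passes clip once through
 * OP() with a (b+16)>>5 normalisation; the _hv_ passes first run the
 * horizontal filter into a 16-bit tmp[] buffer without clipping, then
 * filter that vertically and normalise with OP2()'s (b+512)>>10, keeping
 * the higher intermediate precision the spec requires.  Sanity check on
 * flat input v: one pass gives (v+v)*20 - (v+v)*5 + (v+v) = 32*v and
 * (32*v+16)>>5 == v; two passes give 32*32*v = 1024*v and
 * (1024*v+512)>>10 == v. */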
2464 #define H264_MC(OPNAME, SIZE) \
2465 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2466 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2469 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2470 uint8_t half[SIZE*SIZE];\
2471 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2472 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2475 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2476 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2479 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2480 uint8_t half[SIZE*SIZE];\
2481 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2482 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2485 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2486 uint8_t full[SIZE*(SIZE+5)];\
2487 uint8_t * const full_mid= full + SIZE*2;\
2488 uint8_t half[SIZE*SIZE];\
2489 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2490 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2491 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2494 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2495 uint8_t full[SIZE*(SIZE+5)];\
2496 uint8_t * const full_mid= full + SIZE*2;\
2497 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2498 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2501 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2502 uint8_t full[SIZE*(SIZE+5)];\
2503 uint8_t * const full_mid= full + SIZE*2;\
2504 uint8_t half[SIZE*SIZE];\
2505 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2506 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2507 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2510 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2511 uint8_t full[SIZE*(SIZE+5)];\
2512 uint8_t * const full_mid= full + SIZE*2;\
2513 uint8_t halfH[SIZE*SIZE];\
2514 uint8_t halfV[SIZE*SIZE];\
2515 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2516 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2517 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2518 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2521 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2522 uint8_t full[SIZE*(SIZE+5)];\
2523 uint8_t * const full_mid= full + SIZE*2;\
2524 uint8_t halfH[SIZE*SIZE];\
2525 uint8_t halfV[SIZE*SIZE];\
2526 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2527 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2528 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2529 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2532 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2533 uint8_t full[SIZE*(SIZE+5)];\
2534 uint8_t * const full_mid= full + SIZE*2;\
2535 uint8_t halfH[SIZE*SIZE];\
2536 uint8_t halfV[SIZE*SIZE];\
2537 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2538 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2539 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2540 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2543 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2544 uint8_t full[SIZE*(SIZE+5)];\
2545 uint8_t * const full_mid= full + SIZE*2;\
2546 uint8_t halfH[SIZE*SIZE];\
2547 uint8_t halfV[SIZE*SIZE];\
2548 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2549 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2550 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2551 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2554 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2555 int16_t tmp[SIZE*(SIZE+5)];\
2556 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2559 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2560 int16_t tmp[SIZE*(SIZE+5)];\
2561 uint8_t halfH[SIZE*SIZE];\
2562 uint8_t halfHV[SIZE*SIZE];\
2563 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2564 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2565 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2568 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2569 int16_t tmp[SIZE*(SIZE+5)];\
2570 uint8_t halfH[SIZE*SIZE];\
2571 uint8_t halfHV[SIZE*SIZE];\
2572 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2573 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2574 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2577 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2578 uint8_t full[SIZE*(SIZE+5)];\
2579 uint8_t * const full_mid= full + SIZE*2;\
2580 int16_t tmp[SIZE*(SIZE+5)];\
2581 uint8_t halfV[SIZE*SIZE];\
2582 uint8_t halfHV[SIZE*SIZE];\
2583 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2584 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2585 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2586 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2589 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2590 uint8_t full[SIZE*(SIZE+5)];\
2591 uint8_t * const full_mid= full + SIZE*2;\
2592 int16_t tmp[SIZE*(SIZE+5)];\
2593 uint8_t halfV[SIZE*SIZE];\
2594 uint8_t halfHV[SIZE*SIZE];\
2595 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2596 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2597 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2598 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2599 }
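/* H264_MC assembles the sixteen quarter-pel positions per block size from
 * at most three half-pel planes: mcX0/mc0Y average the source with a single
 * filtered plane, the corners (mc11/mc31/mc13/mc33) average the horizontal
 * and vertical half-pel planes, and the remaining positions blend against
 * the two-dimensionally filtered hv plane.  The copy_block##SIZE calls
 * materialise a (SIZE+5)-row source patch so the vertical filter can read
 * two rows above and three rows below the block. */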
2601 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2602 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2603 #define op_put(a, b) a = cm[((b) + 16)>>5]
2604 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2605 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2607 H264_LOWPASS(put_ , op_put, op2_put)
2608 H264_LOWPASS(avg_ , op_avg, op2_avg)
2609 H264_MC(put_, 2)
2610 H264_MC(put_, 4)
2611 H264_MC(put_, 8)
2612 H264_MC(put_, 16)
2613 H264_MC(avg_, 4)
2614 H264_MC(avg_, 8)
2615 H264_MC(avg_, 16)
2617 #undef op_avg
2618 #undef op_put
2619 #undef op2_avg
2620 #undef op2_put
2621 #endif
2623 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2624 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2625 #define H264_WEIGHT(W,H) \
2626 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2627 int y; \
2628 offset <<= log2_denom; \
2629 if(log2_denom) offset += 1<<(log2_denom-1); \
2630 for(y=0; y<H; y++, block += stride){ \
2631 op_scale1(0); \
2632 op_scale1(1); \
2633 if(W==2) continue; \
2634 op_scale1(2); \
2635 op_scale1(3); \
2636 if(W==4) continue; \
2637 op_scale1(4); \
2638 op_scale1(5); \
2639 op_scale1(6); \
2640 op_scale1(7); \
2641 if(W==8) continue; \
2642 op_scale1(8); \
2643 op_scale1(9); \
2644 op_scale1(10); \
2645 op_scale1(11); \
2646 op_scale1(12); \
2647 op_scale1(13); \
2648 op_scale1(14); \
2649 op_scale1(15); \
2652 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2653 int y; \
2654 offset = ((offset + 1) | 1) << log2_denom; \
2655 for(y=0; y<H; y++, dst += stride, src += stride){ \
2656 op_scale2(0); \
2657 op_scale2(1); \
2658 if(W==2) continue; \
2659 op_scale2(2); \
2660 op_scale2(3); \
2661 if(W==4) continue; \
2662 op_scale2(4); \
2663 op_scale2(5); \
2664 op_scale2(6); \
2665 op_scale2(7); \
2666 if(W==8) continue; \
2667 op_scale2(8); \
2668 op_scale2(9); \
2669 op_scale2(10); \
2670 op_scale2(11); \
2671 op_scale2(12); \
2672 op_scale2(13); \
2673 op_scale2(14); \
2674 op_scale2(15); \
2678 H264_WEIGHT(16,16)
2679 H264_WEIGHT(16,8)
2680 H264_WEIGHT(8,16)
2681 H264_WEIGHT(8,8)
2682 H264_WEIGHT(8,4)
2683 H264_WEIGHT(4,8)
2684 H264_WEIGHT(4,4)
2685 H264_WEIGHT(4,2)
2686 H264_WEIGHT(2,4)
2687 H264_WEIGHT(2,2)
2689 #undef op_scale1
2690 #undef op_scale2
2691 #undef H264_WEIGHT
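/* The weight/biweight functions above implement H.264 explicit weighted
 * prediction.  With the rounding term folded into the offset, the
 * unidirectional form computes
 *
 *   pix = clip(((pix*weight) + (offset<<log2_denom) + round) >> log2_denom)
 *
 * with round = 1<<(log2_denom-1) when log2_denom > 0, and the bidirectional
 * form computes
 *
 *   dst = clip((src*weights + dst*weightd
 *               + (((offset+1)|1) << log2_denom)) >> (log2_denom+1))
 *
 * where ((offset+1)|1) folds the +1 rounding of the final average into the
 * offset.  The unrolled W==2/4/8 early-continue pattern lets one macro
 * serve every partition size from 2x2 up to 16x16. */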
2693 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2694 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2695 int i;
2697 for(i=0; i<h; i++){
2698 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2699 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2700 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2701 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2702 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2703 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2704 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2705 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2706 dst+=dstStride;
2707 src+=srcStride;
2708 }
2709 }
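/* WMV2 half-pel interpolation uses the 4-tap kernel (-1, 9, 9, -1)
 * normalised by (sum + 8) >> 4.  On flat input v this yields
 * (9*(v+v) - (v+v) + 8) >> 4 = (16*v + 8) >> 4 == v, so DC passes through
 * unchanged, while high frequencies come out slightly sharper than with
 * plain bilinear averaging. */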
2711 #ifdef CONFIG_CAVS_DECODER
2712 /* AVS specific */
2713 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2715 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2716 put_pixels8_c(dst, src, stride, 8);
2718 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2719 avg_pixels8_c(dst, src, stride, 8);
2721 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2722 put_pixels16_c(dst, src, stride, 16);
2724 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2725 avg_pixels16_c(dst, src, stride, 16);
2727 #endif /* CONFIG_CAVS_DECODER */
2729 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2730 /* VC-1 specific */
2731 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2733 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2734 put_pixels8_c(dst, src, stride, 8);
2736 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2738 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2740 /* H264 specific */
2741 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2743 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2744 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2745 int i;
2747 for(i=0; i<w; i++){
2748 const int src_1= src[ -srcStride];
2749 const int src0 = src[0 ];
2750 const int src1 = src[ srcStride];
2751 const int src2 = src[2*srcStride];
2752 const int src3 = src[3*srcStride];
2753 const int src4 = src[4*srcStride];
2754 const int src5 = src[5*srcStride];
2755 const int src6 = src[6*srcStride];
2756 const int src7 = src[7*srcStride];
2757 const int src8 = src[8*srcStride];
2758 const int src9 = src[9*srcStride];
2759 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2760 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2761 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2762 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2763 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2764 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2765 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2766 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2767 src++;
2768 dst++;
2772 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2773 put_pixels8_c(dst, src, stride, 8);
2776 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2777 uint8_t half[64];
2778 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2779 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2782 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2783 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2786 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2787 uint8_t half[64];
2788 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2789 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2792 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2793 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2796 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2797 uint8_t halfH[88];
2798 uint8_t halfV[64];
2799 uint8_t halfHV[64];
2800 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2801 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2802 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2803 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2805 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2806 uint8_t halfH[88];
2807 uint8_t halfV[64];
2808 uint8_t halfHV[64];
2809 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2810 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2811 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2812 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2814 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2815 uint8_t halfH[88];
2816 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2817 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2818 }
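/* Note the halfH+8 in the mc12/mc32/mc22 cases above: the horizontal pass
 * is run over 11 rows starting one row above the block (src-stride), so
 * row 0 of halfH is only vertical-filter context and the block proper
 * starts at halfH+8, one 8-byte row in. */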
2820 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2821 if(ENABLE_ANY_H263) {
2822 int x;
2823 const int strength= ff_h263_loop_filter_strength[qscale];
2825 for(x=0; x<8; x++){
2826 int d1, d2, ad1;
2827 int p0= src[x-2*stride];
2828 int p1= src[x-1*stride];
2829 int p2= src[x+0*stride];
2830 int p3= src[x+1*stride];
2831 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2833 if (d<-2*strength) d1= 0;
2834 else if(d<- strength) d1=-2*strength - d;
2835 else if(d< strength) d1= d;
2836 else if(d< 2*strength) d1= 2*strength - d;
2837 else d1= 0;
2839 p1 += d1;
2840 p2 -= d1;
2841 if(p1&256) p1= ~(p1>>31);
2842 if(p2&256) p2= ~(p2>>31);
2844 src[x-1*stride] = p1;
2845 src[x+0*stride] = p2;
2847 ad1= FFABS(d1)>>1;
2849 d2= av_clip((p0-p3)/4, -ad1, ad1);
2851 src[x-2*stride] = p0 - d2;
2852 src[x+ stride] = p3 + d2;
2857 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2858 if(ENABLE_ANY_H263) {
2859 int y;
2860 const int strength= ff_h263_loop_filter_strength[qscale];
2862 for(y=0; y<8; y++){
2863 int d1, d2, ad1;
2864 int p0= src[y*stride-2];
2865 int p1= src[y*stride-1];
2866 int p2= src[y*stride+0];
2867 int p3= src[y*stride+1];
2868 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2870 if (d<-2*strength) d1= 0;
2871 else if(d<- strength) d1=-2*strength - d;
2872 else if(d< strength) d1= d;
2873 else if(d< 2*strength) d1= 2*strength - d;
2874 else d1= 0;
2876 p1 += d1;
2877 p2 -= d1;
2878 if(p1&256) p1= ~(p1>>31);
2879 if(p2&256) p2= ~(p2>>31);
2881 src[y*stride-1] = p1;
2882 src[y*stride+0] = p2;
2884 ad1= FFABS(d1)>>1;
2886 d2= av_clip((p0-p3)/4, -ad1, ad1);
2888 src[y*stride-2] = p0 - d2;
2889 src[y*stride+1] = p3 + d2;
2890 }
2891 }
2892 }
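/* Both H.263 loop filters apply the same non-linear ramp to the edge
 * gradient d = (p0 - p3 + 4*(p2 - p1)) / 8: the correction d1 follows d up
 * to +-strength, then ramps back to zero at +-2*strength, so strong
 * discontinuities (likely real image edges) are left untouched.  Restated
 * as a pure function (illustrative only, mirroring the if/else chain
 * above):
 */
static av_unused int h263_filter_ramp_sketch(int d, int strength)
{
    if (d < -2*strength) return 0;
    if (d <   -strength) return -2*strength - d;
    if (d <    strength) return d;
    if (d <  2*strength) return  2*strength - d;
    return 0;
}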
2894 static void h261_loop_filter_c(uint8_t *src, int stride){
2895 int x,y,xy,yz;
2896 int temp[64];
2898 for(x=0; x<8; x++){
2899 temp[x ] = 4*src[x ];
2900 temp[x + 7*8] = 4*src[x + 7*stride];
2902 for(y=1; y<7; y++){
2903 for(x=0; x<8; x++){
2904 xy = y * stride + x;
2905 yz = y * 8 + x;
2906 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2910 for(y=0; y<8; y++){
2911 src[ y*stride] = (temp[ y*8] + 2)>>2;
2912 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2913 for(x=1; x<7; x++){
2914 xy = y * stride + x;
2915 yz = y * 8 + x;
2916 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2917 }
2918 }
2919 }
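/* The H.261 loop filter above is a separable [1 2 1]/4 smoother applied in
 * two passes: the vertical pass fills temp[] (edge rows are copied through,
 * scaled by 4 to keep a common fixed-point scale), and the horizontal pass
 * combines three temp values with a final (x + 8) >> 4, an overall divide
 * by 16 with rounding.  Edge rows and columns only ever see the 4*v and
 * (4*v + 2) >> 2 path, so they are passed through unfiltered. */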
2921 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2923 int i, d;
2924 for( i = 0; i < 4; i++ ) {
2925 if( tc0[i] < 0 ) {
2926 pix += 4*ystride;
2927 continue;
2929 for( d = 0; d < 4; d++ ) {
2930 const int p0 = pix[-1*xstride];
2931 const int p1 = pix[-2*xstride];
2932 const int p2 = pix[-3*xstride];
2933 const int q0 = pix[0];
2934 const int q1 = pix[1*xstride];
2935 const int q2 = pix[2*xstride];
2937 if( FFABS( p0 - q0 ) < alpha &&
2938 FFABS( p1 - p0 ) < beta &&
2939 FFABS( q1 - q0 ) < beta ) {
2941 int tc = tc0[i];
2942 int i_delta;
2944 if( FFABS( p2 - p0 ) < beta ) {
2945 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2946 tc++;
2948 if( FFABS( q2 - q0 ) < beta ) {
2949 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2950 tc++;
2953 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2954 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2955 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
2957 pix += ystride;
2961 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2963 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2965 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2966 {
2967 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
2968 }
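/* The H.264 luma deblocker only fires where |p0-q0| < alpha and the inner
 * gradients |p1-p0| and |q1-q0| are below beta, i.e. where the step across
 * the edge is small enough to be a coding artifact rather than real image
 * content.  tc0[i] (one entry per 4-pixel group, negative meaning "skip")
 * bounds the correction, and is widened by one for each side whose
 * second-row gradient (|p2-p0| or |q2-q0|) also passes the beta test,
 * since that side additionally gets its p1/q1 sample adjusted.  The v/h
 * wrappers simply swap xstride and ystride so one kernel serves both edge
 * directions. */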
2970 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2972 int i, d;
2973 for( i = 0; i < 4; i++ ) {
2974 const int tc = tc0[i];
2975 if( tc <= 0 ) {
2976 pix += 2*ystride;
2977 continue;
2979 for( d = 0; d < 2; d++ ) {
2980 const int p0 = pix[-1*xstride];
2981 const int p1 = pix[-2*xstride];
2982 const int q0 = pix[0];
2983 const int q1 = pix[1*xstride];
2985 if( FFABS( p0 - q0 ) < alpha &&
2986 FFABS( p1 - p0 ) < beta &&
2987 FFABS( q1 - q0 ) < beta ) {
2989 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2991 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2992 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
2994 pix += ystride;
2998 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3000 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3002 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3004 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
3007 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3009 int d;
3010 for( d = 0; d < 8; d++ ) {
3011 const int p0 = pix[-1*xstride];
3012 const int p1 = pix[-2*xstride];
3013 const int q0 = pix[0];
3014 const int q1 = pix[1*xstride];
3016 if( FFABS( p0 - q0 ) < alpha &&
3017 FFABS( p1 - p0 ) < beta &&
3018 FFABS( q1 - q0 ) < beta ) {
3020 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3021 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3023 pix += ystride;
3026 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3028 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3030 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3032 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
3035 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3037 int s, i;
3039 s = 0;
3040 for(i=0;i<h;i++) {
3041 s += abs(pix1[0] - pix2[0]);
3042 s += abs(pix1[1] - pix2[1]);
3043 s += abs(pix1[2] - pix2[2]);
3044 s += abs(pix1[3] - pix2[3]);
3045 s += abs(pix1[4] - pix2[4]);
3046 s += abs(pix1[5] - pix2[5]);
3047 s += abs(pix1[6] - pix2[6]);
3048 s += abs(pix1[7] - pix2[7]);
3049 s += abs(pix1[8] - pix2[8]);
3050 s += abs(pix1[9] - pix2[9]);
3051 s += abs(pix1[10] - pix2[10]);
3052 s += abs(pix1[11] - pix2[11]);
3053 s += abs(pix1[12] - pix2[12]);
3054 s += abs(pix1[13] - pix2[13]);
3055 s += abs(pix1[14] - pix2[14]);
3056 s += abs(pix1[15] - pix2[15]);
3057 pix1 += line_size;
3058 pix2 += line_size;
3059 }
3060 return s;
3061 }
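/* pix_abs{16,8}_c and their _x2/_y2/_xy2 variants compute the sum of
 * absolute differences (SAD) between a candidate block and a reference
 * taken at integer, horizontal half, vertical half, and diagonal half-pel
 * positions respectively.  avg2()/avg4() are the rounding half-pel
 * averages defined earlier in this file (avg2(a,b) = (a+b+1)>>1,
 * avg4(a,b,c,d) = (a+b+c+d+2)>>2), so e.g. the _x2 variant scores the
 * block as if the reference had been motion compensated by half a pixel
 * horizontally. */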
3063 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3065 int s, i;
3067 s = 0;
3068 for(i=0;i<h;i++) {
3069 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3070 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3071 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3072 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3073 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3074 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3075 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3076 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3077 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3078 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3079 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3080 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3081 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3082 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3083 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3084 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
3085 pix1 += line_size;
3086 pix2 += line_size;
3088 return s;
3091 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3093 int s, i;
3094 uint8_t *pix3 = pix2 + line_size;
3096 s = 0;
3097 for(i=0;i<h;i++) {
3098 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3099 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3100 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3101 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3102 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3103 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3104 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3105 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3106 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3107 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3108 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3109 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3110 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3111 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3112 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3113 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
3114 pix1 += line_size;
3115 pix2 += line_size;
3116 pix3 += line_size;
3118 return s;
3121 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3123 int s, i;
3124 uint8_t *pix3 = pix2 + line_size;
3126 s = 0;
3127 for(i=0;i<h;i++) {
3128 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3129 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3130 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3131 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3132 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3133 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3134 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3135 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3136 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3137 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3138 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3139 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3140 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3141 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3142 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3143 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
3144 pix1 += line_size;
3145 pix2 += line_size;
3146 pix3 += line_size;
3148 return s;
3151 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3153 int s, i;
3155 s = 0;
3156 for(i=0;i<h;i++) {
3157 s += abs(pix1[0] - pix2[0]);
3158 s += abs(pix1[1] - pix2[1]);
3159 s += abs(pix1[2] - pix2[2]);
3160 s += abs(pix1[3] - pix2[3]);
3161 s += abs(pix1[4] - pix2[4]);
3162 s += abs(pix1[5] - pix2[5]);
3163 s += abs(pix1[6] - pix2[6]);
3164 s += abs(pix1[7] - pix2[7]);
3165 pix1 += line_size;
3166 pix2 += line_size;
3168 return s;
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
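/* The two functions below implement the noise-preserving SSE metric
 * (FF_CMP_NSSE): score1 is the plain sum of squared differences, while
 * score2 accumulates how differently textured the two blocks are, via
 * second-order horizontal gradients. The texture term is weighted by
 * avctx->nsse_weight (8 when no context is available), penalizing matches
 * that smooth away noise. */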
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x] - s2[x])*(s1[x] - s2[x]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x] - s2[x])*(s1[x] - s2[x]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
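/* try_8x8basis()/add_8x8basis() apparently support the encoder's iterative
 * coefficient refinement: basis[] holds one scaled transform basis function;
 * try_8x8basis_c estimates the weighted squared error that would remain if it
 * were added to the residual rem[], and add_8x8basis_c actually applies it.
 * BASIS_SHIFT and RECON_SHIFT are fixed-point scaling constants from
 * dsputil.h. */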
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
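/* ff_set_cmp() fills a 5-entry comparison-function table (the slots are used
 * for different block sizes) with the metric selected by the low byte of
 * 'type', one of the FF_CMP_* values from avcodec.h. zero_cmp() above serves
 * FF_CMP_ZERO, a metric that rates every candidate as an equally good match. */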
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*5);

    for(i=0; i<5; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#ifdef CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
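/* add_bytes_c(), add_bytes_l2_c() and diff_bytes_c() process sizeof(long)
 * bytes per iteration without SIMD ("SWAR"): pb_7f/pb_80 replicate 0x7f/0x80
 * into every byte of a long. ((a&pb_7f) + (b&pb_7f)) adds the low 7 bits of
 * each byte with no carry crossing byte lanes, and xoring with ((a^b)&pb_80)
 * restores each byte's top bit as a7 ^ b7 ^ carry, i.e. per-byte addition
 * modulo 256 (e.g. 0xff + 0x01 gives 0x00 in every lane). diff_bytes_c uses
 * the analogous borrow-free construction for per-byte subtraction. */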
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#ifndef HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
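/* hadamard8_diff8x8_c() computes the SATD of an 8x8 block: the pixel
 * differences are run through a 2-D 8-point Walsh-Hadamard transform (three
 * butterfly stages per dimension; the last vertical stage is folded into
 * BUTTERFLYA together with taking absolute values), and the absolute
 * transformed coefficients are summed. hadamard8_intra8x8_c() further below
 * does the same on the source pixels alone and subtracts the DC term
 * ("-mean") so that flat blocks score zero. */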
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
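/* DCT8_1D below is one 1-D pass of an integer 8-point DCT built from adds
 * and shifts only; its add/shift structure matches the H.264 high-profile
 * 8x8 forward transform, which is presumably why the block is gated on
 * CONFIG_GPL. dct264_sad8x8_c applies it to the rows, then to the columns
 * while summing absolute values via the redefined DST() macro. */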
#ifdef CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
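/* rd8x8_c() returns a rate-distortion score: it quantizes and codes the
 * residual the same way the encoder would, counts the VLC bits, reconstructs
 * the block, and returns distortion plus the bits weighted by a
 * qscale-dependent Lagrange factor, bits*qscale*qscale*109/128. */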
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    uint8_t * const bak= (uint8_t*)aligned_bak;
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    for(i=0; i<8; i++){
        ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
        ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
    }

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(bak, stride, temp);

    distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
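/* bit8x8_c() is the rate-only counterpart of rd8x8_c(): it performs the same
 * quantization and VLC walk but skips reconstruction and returns just the
 * estimated bit count. */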
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
    DCTELEM * const temp= (DCTELEM*)aligned_temp;
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
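/* WRAPPER8_16_SQ (defined earlier in this file) builds each 16x16 comparison
 * function out of its 8x8 counterpart by evaluating the 8x8 metric on the
 * four quadrants of the block and summing the results. */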
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#ifdef CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
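/* ff_vector_fmul_window_c() performs the windowed overlap-add used when
 * reconstructing audio from an MDCT: each pass of the loop applies the rising
 * and falling halves of the symmetric window 'win' to a pair of samples and
 * writes both output halves around the center of 'dst'. add_bias is a DC
 * offset added to every sample, presumably to feed the biased
 * float_to_int16 conversion further below. */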
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi + add_bias;
        dst[j] = s0*wi + s1*wj + add_bias;
    }
}

static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src[i] * mul;
}
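/* float_to_int16_one() converts via bit manipulation rather than a float
 * compare-and-cast: it assumes the sample was scaled and biased so that
 * in-range values have an IEEE-754 bit pattern of 0x43c0xxxx (i.e. a bias of
 * 385.0f, with one 16-bit step per unit in the low bits). The cheap
 * (tmp & 0xf0000) test catches out-of-range patterns, which the sign-shift
 * expression saturates to 0 or 0xFFFF; subtracting 0x8000 recenters the low
 * 16 bits, which are all that survive the store into an int16_t. */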
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        tmp = (0x43c0ffff - tmp)>>31;
        // is this faster on some gcc/cpu combinations?
//      if(tmp > 0x43c0ffff) tmp = 0xFFFF;
//      else                 tmp = 0;
    }
    return tmp - 0x8000;
}

void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = float_to_int16_one(src+i);
}

void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i,j,c;
    if(channels==2){
        for(i=0; i<len; i++){
            dst[2*i]   = float_to_int16_one(src[0]+i);
            dst[2*i+1] = float_to_int16_one(src[1]+i);
        }
    }else{
        for(c=0; c<channels; c++)
            for(i=0, j=c; i<len; i++, j+=channels)
                dst[j] = float_to_int16_one(src[c]+i);
    }
}
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
        *v1++ += *v2++;
}

static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    while (order--)
        *v1++ -= *v2++;
}

static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
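/* The WMV2 IDCT below is the classic even/odd 8-point decomposition in fixed
 * point, using the W0..W7 cosine constants above (11 fractional bits). The
 * row pass deliberately under-shifts (>>8 rather than the full 11 bits) to
 * keep extra precision, and the column pass works at that extended precision
 * before its final >>14 brings the result back to pixel scale. */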
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}

static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}

void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
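/* ff_cropTbl is laid out as MAX_NEG_CROP zeros, the identity mapping 0..255,
 * then MAX_NEG_CROP entries of 255; indexing it through
 * cm = ff_cropTbl + MAX_NEG_CROP (as in the ff_jref_idct1_* wrappers above)
 * therefore clamps a signed value to the [0,255] pixel range without
 * branches. */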
/* init static data */
void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED_16(int, aligned);

    if((long)&aligned & 15){
        if(!did_fail){
#if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
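/* dsputil_init() fills a DSPContext with the portable C implementations and
 * then lets the per-architecture init functions near the bottom override
 * individual pointers. A typical caller looks roughly like this (sketch;
 * the variable names are illustrative):
 *
 *     DSPContext dsp;
 *     dsputil_init(&dsp, avctx);
 *     sad = dsp.pix_abs[0][0](NULL, cur, ref, stride, 16); // 16x16 SAD
 */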
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#ifdef CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct    = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct    = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct    = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    if (ENABLE_H264_DECODER) {
        c->h264_idct_add= ff_h264_idct_add_c;
        c->h264_idct8_add= ff_h264_idct8_add_c;
        c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
        c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
    }

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    /* pix_abs[size][mode]: size index 0 is 16x16, 1 is 8x8; mode 0 compares
       against the full-pel reference, modes 1-3 against its horizontal,
       vertical and diagonal half-pel interpolations. */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
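    /* dspfunc() wires up the half-pel copy/average tables: IDX is the block
       size class (0=16, 1=8, 2=4, 3=2 pixels wide) and the second table index
       is the interpolation case, ordered as in pix_abs above. */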
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

    c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
    c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
    c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
    c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
    c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
    c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
    c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
    c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
    c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
    c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
    c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
    c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
    c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
    c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
    c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
    c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
    c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
    c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
    c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
    c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
    c->draw_edges = draw_edges_c;

#ifdef CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_vc1dsp_init(c,avctx);
#endif
#if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
    ff_intrax8dsp_init(c,avctx);
#endif
#if defined(CONFIG_H264_ENCODER)
    ff_h264dspenc_init(c,avctx);
#endif
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#ifdef CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#ifdef CONFIG_SNOW_ENCODER
    c->w53[0]= w53_16_c;
    c->w53[1]= w53_8_c;
    c->w97[0]= w97_16_c;
    c->w97[1]= w97_8_c;
#endif
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
#ifdef CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
    c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
    c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
    c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
    c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
    c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
    c->h264_loop_filter_strength= NULL;

    if (ENABLE_ANY_H263) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#ifdef CONFIG_SNOW_DECODER
    c->vertical_compose97i = ff_snow_vertical_compose97i;
    c->horizontal_compose97i = ff_snow_horizontal_compose97i;
    c->inner_add_yblock = ff_snow_inner_add_yblock;
#endif

#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#ifdef CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
#ifdef CONFIG_FLAC_ENCODER
    c->flac_compute_autocorr = ff_flac_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->add_int16 = add_int16_c;
    c->sub_int16 = sub_int16_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;

    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    if (ENABLE_MMX)      dsputil_init_mmx   (c, avctx);
    if (ENABLE_ARMV4L)   dsputil_init_armv4l(c, avctx);
    if (ENABLE_MLIB)     dsputil_init_mlib  (c, avctx);
    if (ENABLE_VIS)      dsputil_init_vis   (c, avctx);
    if (ENABLE_ALPHA)    dsputil_init_alpha (c, avctx);
    if (ENABLE_POWERPC)  dsputil_init_ppc   (c, avctx);
    if (ENABLE_MMI)      dsputil_init_mmi   (c, avctx);
    if (ENABLE_SH4)      dsputil_init_sh4   (c, avctx);
    if (ENABLE_BFIN)     dsputil_init_bfin  (c, avctx);

    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }
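    /* Several optimized IDCTs consume their coefficients in a non-natural
       order; idct_permutation maps natural (raster) coefficient positions to
       the order the selected IDCT expects, so scantables and quantization
       matrices can be permuted once at init time instead of reordering every
       block. */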
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");