/*****************************************************************************
 * dct.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
#include "common.h"
#ifdef HAVE_MMX
#   include "x86/dct.h"
#endif
#ifdef ARCH_PPC
#   include "ppc/dct.h"
#endif

int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
/*
 * XXX For all dct dc functions: the input may alias the output, so each
 * transform is computed into a temporary buffer before being written back.
 */
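/* dct4x4dc: 4x4 Walsh-Hadamard transform of the 16 luma DC coefficients of
 * an Intra-16x16 macroblock. Each pass is a cascade of 2-point butterflies;
 * the forward transform rounds with ( +1 ) >> 1 to keep the output in the
 * same dynamic range as the input. */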
static void dct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        d[i][0] = ( s01 + s23 + 1 ) >> 1;
        d[i][1] = ( s01 - s23 + 1 ) >> 1;
        d[i][2] = ( d01 - d23 + 1 ) >> 1;
        d[i][3] = ( d01 + d23 + 1 ) >> 1;
    }
}
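/* idct4x4dc: inverse of the above. Note there is no rounding shift here;
 * the compensating scale factor is presumably applied elsewhere in the
 * dequant path rather than in this function. */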
static void idct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        d[i][0] = s01 + s23;
        d[i][1] = s01 - s23;
        d[i][2] = d01 - d23;
        d[i][3] = d01 + d23;
    }
}
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int y, x;
    for( y = 0; y < i_size; y++ )
    {
        for( x = 0; x < i_size; x++ )
        {
            diff[x + y*i_size] = pix1[x] - pix2[x];
        }
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
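/* sub4x4_dct: computes the residual pix1 - pix2, then applies the H.264
 * 4x4 integer core transform. The butterflies implement the rows of
 *     [ 1  1  1  1 ]
 *     [ 2  1 -1 -2 ]
 *     [ 1 -1 -1  1 ]
 *     [ 1 -2  2 -1 ]
 * applied horizontally and vertically. The transform is not orthonormal;
 * the leftover scaling is absorbed by the quantization tables rather than
 * normalized here. */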
static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
{
    int16_t d[4][4];
    int16_t tmp[4][4];
    int i;

    pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    for( i = 0; i < 4; i++ )
    {
        const int s03 = d[i][0] + d[i][3];
        const int s12 = d[i][1] + d[i][2];
        const int d03 = d[i][0] - d[i][3];
        const int d12 = d[i][1] - d[i][2];

        tmp[0][i] = s03 + s12;
        tmp[1][i] = 2*d03 + d12;
        tmp[2][i] = s03 - s12;
        tmp[3][i] = d03 - 2*d12;
    }

    for( i = 0; i < 4; i++ )
    {
        const int s03 = tmp[i][0] + tmp[i][3];
        const int s12 = tmp[i][1] + tmp[i][2];
        const int d03 = tmp[i][0] - tmp[i][3];
        const int d12 = tmp[i][1] - tmp[i][2];

        dct[i][0] = s03 + s12;
        dct[i][1] = 2*d03 + d12;
        dct[i][2] = s03 - s12;
        dct[i][3] = d03 - 2*d12;
    }
}
static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
{
    sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
    sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}

static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
{
    sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
    sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
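/* add4x4_idct: H.264 4x4 inverse core transform, then add the result back
 * onto the prediction in p_dst with clipping to [0,255]. The >>1 terms
 * implement the spec's half-weight on the odd-frequency coefficients, and
 * ( x + 32 ) >> 6 performs the final rounding after both 1-D passes. */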
static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
{
    int16_t d[4][4];
    int16_t tmp[4][4];
    int x, y;
    int i;

    for( i = 0; i < 4; i++ )
    {
        const int s02 =  dct[0][i]     +  dct[2][i];
        const int d02 =  dct[0][i]     -  dct[2][i];
        const int s13 =  dct[1][i]     + (dct[3][i]>>1);
        const int d13 = (dct[1][i]>>1) -  dct[3][i];

        tmp[i][0] = s02 + s13;
        tmp[i][1] = d02 + d13;
        tmp[i][2] = d02 - d13;
        tmp[i][3] = s02 - s13;
    }

    for( i = 0; i < 4; i++ )
    {
        const int s02 =  tmp[0][i]     +  tmp[2][i];
        const int d02 =  tmp[0][i]     -  tmp[2][i];
        const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
        const int d13 = (tmp[1][i]>>1) -  tmp[3][i];

        d[0][i] = ( s02 + s13 + 32 ) >> 6;
        d[1][i] = ( d02 + d13 + 32 ) >> 6;
        d[2][i] = ( d02 - d13 + 32 ) >> 6;
        d[3][i] = ( s02 - s13 + 32 ) >> 6;
    }

    for( y = 0; y < 4; y++ )
    {
        for( x = 0; x < 4; x++ )
        {
            p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
        }
        p_dst += FDEC_STRIDE;
    }
}
static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
{
    add4x4_idct( &p_dst[0], dct[0] );
    add4x4_idct( &p_dst[4], dct[1] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
{
    add8x8_idct( &p_dst[0], &dct[0] );
    add8x8_idct( &p_dst[8], &dct[4] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/
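/* DCT8_1D expands to one 8-point 1-D butterfly. SRC and DST are macros
 * redefined around each use, which is what selects row vs. column access
 * between the first and second pass, and lets the same body either write
 * in place or into the output array. */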
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
{
    int i;
    int16_t tmp[8][8];

    pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

#define SRC(x) tmp[x][i]
#define DST(x) tmp[x][i]
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) tmp[i][x]
#define DST(x) dct[x][i]
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}

static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
{
    sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
    sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
#define IDCT8_1D {\
    const int a0 =  SRC(0) + SRC(4);\
    const int a2 =  SRC(0) - SRC(4);\
    const int a4 = (SRC(2)>>1) - SRC(6);\
    const int a6 = (SRC(6)>>1) + SRC(2);\
    const int b0 = a0 + a6;\
    const int b2 = a2 + a4;\
    const int b4 = a2 - a4;\
    const int b6 = a0 - a6;\
    const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    const int b1 = (a7>>2) + a1;\
    const int b3 =  a3 + (a5>>2);\
    const int b5 = (a3>>2) - a5;\
    const int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
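/* add8x8_idct8: because the 1-D butterflies weight the DC input by exactly
 * 1 at every output, adding 32 to dct[0][0] up front biases all 64 results
 * by +32, so the single ( rhs ) >> 6 in the final DST gives correctly
 * rounded output without a separate rounding pass. */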
static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
{
    int i;

    dct[0][0] += 32; // rounding for the >>6 at the end

#define SRC(x)     dct[x][i]
#define DST(x,rhs) dct[x][i] = (rhs)
    for( i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

#define SRC(x)     dct[i][x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}

static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
{
    add8x8_idct8( &dst[0],               dct[0] );
    add8x8_idct8( &dst[8],               dct[1] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
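/* Fills the function-pointer table with the plain C implementations above,
 * then overrides individual entries with asm versions according to the CPU
 * flag bits. A hypothetical caller would look something like:
 *     x264_dct_function_t dctf;
 *     x264_dct_init( cpu_flags, &dctf );
 *     dctf.sub4x4_dct( dct, p_fenc, p_fdec );
 * where cpu_flags is a mask of X264_CPU_* bits and p_fenc/p_fdec stand in
 * for FENC_STRIDE/FDEC_STRIDE pixel buffers. */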
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;

    dctf->sub8x8_dct    = sub8x8_dct;
    dctf->add8x8_idct   = add8x8_idct;

    dctf->sub16x16_dct  = sub16x16_dct;
    dctf->add16x16_idct = add16x16_idct;

    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;

    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc  = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

#ifdef HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct  = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct = x264_add4x4_idct_mmx;
        dctf->dct4x4dc    = x264_dct4x4dc_mmx;
        dctf->idct4x4dc   = x264_idct4x4dc_mmx;

#ifndef ARCH_X86_64
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
    }

    if( cpu&X264_CPU_SSE2 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;

        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
    }
#endif //HAVE_MMX

#ifdef ARCH_PPC
    if( cpu&X264_CPU_ALTIVEC )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;

        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    }
#endif
}
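/* Precomputes zigzag-ordered copies of the dct weight tables. Per the
 * commit message that introduced them, these lookup tables let adaptive
 * quantization (AQ) avoid evaluating exp/pow at runtime. */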
void x264_dct_init_weights( void )
{
    int i, j;
    for( j=0; j<2; j++ )
    {
        for( i=0; i<16; i++ )
            x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
        for( i=0; i<64; i++ )
            x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
    }
}
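/****************************************************************************
 * zigzag scans: reorder coefficients from raster order to (roughly)
 * increasing spatial frequency, so the trailing zeros that quantization
 * produces cluster at the end of the scan for cheap entropy coding.
 ****************************************************************************/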
// gcc pessimizes multi-dimensional arrays here, even with constant indices
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
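/* The field scans differ from the frame scans because an interlaced field
 * has half the vertical sample density, so the scan is biased toward
 * consuming vertical frequencies earlier. */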
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
{
    ZIGZAG8_FRAME
}

static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
{
    ZIGZAG8_FIELD
}

#undef ZIG
#define ZIG(i,y,x) level[i] = dct[0][x*4+y];

static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
{
    ZIGZAG4_FRAME
}

static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
{
    *(uint32_t*)level = *(uint32_t*)dct;
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
    *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
    *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
}
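/* In the 4x4 field scan only coefficients 2..5 differ from raster order,
 * so zigzag_scan_4x4_field permutes just those four and copies everything
 * else straight through with the 32/64-bit moves above. */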
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
}
#define COPY4x4\
    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
#define COPY8x8\
    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
    ZIGZAG4_FRAME
    COPY4x4
}

static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
    ZIGZAG4_FIELD
    COPY4x4
}

static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{
    ZIGZAG8_FRAME
    COPY8x8
}

static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{
    ZIGZAG8_FIELD
    COPY8x8
}
#undef ZIG
#undef COPY4x4
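/* CAVLC codes an 8x8 luma block as four 4x4 blocks, so the 64 coefficients
 * (stored 4-way interleaved in scan order in src) are gathered here into
 * four contiguous 16-coefficient runs. */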
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src )
{
    int i,j;
    for( i=0; i<4; i++ )
        for( j=0; j<16; j++ )
            dst[i*16+j] = src[i+j*4];
}
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
    if( b_interlaced )
    {
        pf->scan_8x8 = zigzag_scan_8x8_field;
        pf->scan_4x4 = zigzag_scan_4x4_field;
        pf->sub_8x8  = zigzag_sub_8x8_field;
        pf->sub_4x4  = zigzag_sub_4x4_field;
#ifdef HAVE_MMX
        if( cpu&X264_CPU_MMXEXT )
            pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
#endif

#ifdef ARCH_PPC
        if( cpu&X264_CPU_ALTIVEC )
            pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
#endif
    }
    else
    {
        pf->scan_8x8 = zigzag_scan_8x8_frame;
        pf->scan_4x4 = zigzag_scan_4x4_frame;
        pf->sub_8x8  = zigzag_sub_8x8_frame;
        pf->sub_4x4  = zigzag_sub_4x4_frame;
#ifdef HAVE_MMX
        if( cpu&X264_CPU_MMX )
            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
        if( cpu&X264_CPU_MMXEXT )
            pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
        if( cpu&X264_CPU_SSE2_IS_FAST )
            pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
        if( cpu&X264_CPU_SSSE3 )
        {
            pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
            pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        }
        if( cpu&X264_CPU_PHADD_IS_FAST )
            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
#endif

#ifdef ARCH_PPC
        if( cpu&X264_CPU_ALTIVEC )
            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
#endif
    }

    pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#ifdef HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
#endif
}
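/* Usage sketch (hypothetical caller, mirroring x264_dct_init above):
 *     x264_zigzag_function_t zigzagf;
 *     x264_zigzag_init( cpu_flags, &zigzagf, b_interlaced );
 *     zigzagf.scan_4x4( level, dct4x4 );
 * where cpu_flags is a mask of X264_CPU_* bits. */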