/*****************************************************************************
 * dct.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
#include "common.h"
#ifdef HAVE_MMX
#   include "x86/dct.h"
#endif
#ifdef ARCH_PPC
#   include "ppc/dct.h"
#endif

int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
/*
 * XXX For all dct dc functions: the input may alias the output, so each
 * transform is computed into a temporary buffer before being written back.
 */
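/* dct4x4dc: 4x4 Walsh-Hadamard transform of the 16 luma DC coefficients of
 * an Intra-16x16 macroblock. Each pass is a cascade of 2-point butterflies;
 * the forward transform rounds with ( +1 ) >> 1 to keep the output in the
 * same dynamic range as the input. */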
static void dct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        d[i][0] = ( s01 + s23 + 1 ) >> 1;
        d[i][1] = ( s01 - s23 + 1 ) >> 1;
        d[i][2] = ( d01 - d23 + 1 ) >> 1;
        d[i][3] = ( d01 + d23 + 1 ) >> 1;
    }
}
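/* idct4x4dc: inverse of the above. Note there is no rounding shift here;
 * the compensating scale factor is presumably applied elsewhere in the
 * dequant path rather than in this function. */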
static void idct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        d[i][0] = s01 + s23;
        d[i][1] = s01 - s23;
        d[i][2] = d01 - d23;
        d[i][3] = d01 + d23;
    }
}
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int y, x;
    for( y = 0; y < i_size; y++ )
    {
        for( x = 0; x < i_size; x++ )
        {
            diff[x + y*i_size] = pix1[x] - pix2[x];
        }
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
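/* sub4x4_dct: computes the residual pix1 - pix2, then applies the H.264
 * 4x4 integer core transform. The butterflies implement the rows of
 *     [ 1  1  1  1 ]
 *     [ 2  1 -1 -2 ]
 *     [ 1 -1 -1  1 ]
 *     [ 1 -2  2 -1 ]
 * applied horizontally and vertically. The transform is not orthonormal;
 * the leftover scaling is absorbed by the quantization tables rather than
 * normalized here. */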
static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
{
    int16_t d[4][4];
    int16_t tmp[4][4];
    int i;

    pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    for( i = 0; i < 4; i++ )
    {
        const int s03 = d[i][0] + d[i][3];
        const int s12 = d[i][1] + d[i][2];
        const int d03 = d[i][0] - d[i][3];
        const int d12 = d[i][1] - d[i][2];

        tmp[0][i] = s03 + s12;
        tmp[1][i] = 2*d03 + d12;
        tmp[2][i] = s03 - s12;
        tmp[3][i] = d03 - 2*d12;
    }

    for( i = 0; i < 4; i++ )
    {
        const int s03 = tmp[i][0] + tmp[i][3];
        const int s12 = tmp[i][1] + tmp[i][2];
        const int d03 = tmp[i][0] - tmp[i][3];
        const int d12 = tmp[i][1] - tmp[i][2];

        dct[i][0] = s03 + s12;
        dct[i][1] = 2*d03 + d12;
        dct[i][2] = s03 - s12;
        dct[i][3] = d03 - 2*d12;
    }
}
static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
{
    sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
    sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}

static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
{
    sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
    sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
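/* add4x4_idct: H.264 4x4 inverse core transform, then add the result back
 * onto the prediction in p_dst with clipping to [0,255]. The >>1 terms
 * implement the spec's half-weight on the odd-frequency coefficients, and
 * ( x + 32 ) >> 6 performs the final rounding after both 1-D passes. */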
static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
{
    int16_t d[4][4];
    int16_t tmp[4][4];
    int x, y;
    int i;

    for( i = 0; i < 4; i++ )
    {
        const int s02 =  dct[0][i]     +  dct[2][i];
        const int d02 =  dct[0][i]     -  dct[2][i];
        const int s13 =  dct[1][i]     + (dct[3][i]>>1);
        const int d13 = (dct[1][i]>>1) -  dct[3][i];

        tmp[i][0] = s02 + s13;
        tmp[i][1] = d02 + d13;
        tmp[i][2] = d02 - d13;
        tmp[i][3] = s02 - s13;
    }

    for( i = 0; i < 4; i++ )
    {
        const int s02 =  tmp[0][i]     +  tmp[2][i];
        const int d02 =  tmp[0][i]     -  tmp[2][i];
        const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
        const int d13 = (tmp[1][i]>>1) -  tmp[3][i];

        d[0][i] = ( s02 + s13 + 32 ) >> 6;
        d[1][i] = ( d02 + d13 + 32 ) >> 6;
        d[2][i] = ( d02 - d13 + 32 ) >> 6;
        d[3][i] = ( s02 - s13 + 32 ) >> 6;
    }

    for( y = 0; y < 4; y++ )
    {
        for( x = 0; x < 4; x++ )
        {
            p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
        }
        p_dst += FDEC_STRIDE;
    }
}
static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
{
    add4x4_idct( &p_dst[0], dct[0] );
    add4x4_idct( &p_dst[4], dct[1] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
{
    add8x8_idct( &p_dst[0], &dct[0] );
    add8x8_idct( &p_dst[8], &dct[4] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/
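/* DCT8_1D expands to one 8-point 1-D butterfly. SRC and DST are macros
 * redefined around each use, which is what selects row vs. column access
 * between the first and second pass, and lets the same body either write
 * in place or into the output array. */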
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
{
    int i;
    int16_t tmp[8][8];

    pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

#define SRC(x) tmp[x][i]
#define DST(x) tmp[x][i]
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) tmp[i][x]
#define DST(x) dct[x][i]
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}

static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
{
    sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
    sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
#define IDCT8_1D {\
    const int a0 =  SRC(0) + SRC(4);\
    const int a2 =  SRC(0) - SRC(4);\
    const int a4 = (SRC(2)>>1) - SRC(6);\
    const int a6 = (SRC(6)>>1) + SRC(2);\
    const int b0 = a0 + a6;\
    const int b2 = a2 + a4;\
    const int b4 = a2 - a4;\
    const int b6 = a0 - a6;\
    const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    const int b1 = (a7>>2) + a1;\
    const int b3 =  a3 + (a5>>2);\
    const int b5 = (a3>>2) - a5;\
    const int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
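/* add8x8_idct8: because the 1-D butterflies weight the DC input by exactly
 * 1 at every output, adding 32 to dct[0][0] up front biases all 64 results
 * by +32, so the single ( rhs ) >> 6 in the final DST gives correctly
 * rounded output without a separate rounding pass. */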
static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
{
    int i;

    dct[0][0] += 32; // rounding for the >>6 at the end

#define SRC(x)     dct[x][i]
#define DST(x,rhs) dct[x][i] = (rhs)
    for( i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

#define SRC(x)     dct[i][x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}

static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
{
    add8x8_idct8( &dst[0],               dct[0] );
    add8x8_idct8( &dst[8],               dct[1] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}
/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
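/* Fills the function-pointer table with the plain C implementations above,
 * then overrides individual entries with asm versions according to the CPU
 * flag bits. A hypothetical caller would look something like:
 *     x264_dct_function_t dctf;
 *     x264_dct_init( cpu_flags, &dctf );
 *     dctf.sub4x4_dct( dct, p_fenc, p_fdec );
 * where cpu_flags is a mask of X264_CPU_* bits and p_fenc/p_fdec stand in
 * for FENC_STRIDE/FDEC_STRIDE pixel buffers. */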
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;

    dctf->sub8x8_dct    = sub8x8_dct;
    dctf->add8x8_idct   = add8x8_idct;

    dctf->sub16x16_dct  = sub16x16_dct;
    dctf->add16x16_idct = add16x16_idct;

    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;

    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc  = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

#ifdef HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct  = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct = x264_add4x4_idct_mmx;
        dctf->dct4x4dc    = x264_dct4x4dc_mmx;
        dctf->idct4x4dc   = x264_idct4x4dc_mmx;

#ifndef ARCH_X86_64
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
    }

    if( cpu&X264_CPU_SSE2 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;

        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
    }
#endif //HAVE_MMX

#ifdef ARCH_PPC
    if( cpu&X264_CPU_ALTIVEC )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;

        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    }
#endif
}
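/* Precomputes zigzag-ordered copies of the dct weight tables. Per the
 * commit message that introduced them, these lookup tables let adaptive
 * quantization (AQ) avoid evaluating exp/pow at runtime. */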
void x264_dct_init_weights( void )
{
    int i, j;
    for( j=0; j<2; j++ )
    {
        for( i=0; i<16; i++ )
            x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
        for( i=0; i<64; i++ )
            x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
    }
}
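/****************************************************************************
 * zigzag scans: reorder coefficients from raster order to (roughly)
 * increasing spatial frequency, so the trailing zeros that quantization
 * produces cluster at the end of the scan for cheap entropy coding.
 ****************************************************************************/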
// gcc pessimizes multi-dimensional arrays here, even with constant indices
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
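/* The field scans differ from the frame scans because an interlaced field
 * has half the vertical sample density, so the scan is biased toward
 * consuming vertical frequencies earlier. */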
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
{
    ZIGZAG8_FRAME
}

static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
{
    ZIGZAG8_FIELD
}

#undef ZIG
#define ZIG(i,y,x) level[i] = dct[0][x*4+y];

static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
{
    ZIGZAG4_FRAME
}

static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
{
    *(uint32_t*)level = *(uint32_t*)dct;
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
    *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
    *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
}
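/* In the 4x4 field scan only coefficients 2..5 differ from raster order,
 * so zigzag_scan_4x4_field permutes just those four and copies everything
 * else straight through with the 32/64-bit moves above. */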
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
}
#define COPY4x4\
    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
#define COPY8x8\
    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
    ZIGZAG4_FRAME
    COPY4x4
}

static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
    ZIGZAG4_FIELD
    COPY4x4
}

static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{
    ZIGZAG8_FRAME
    COPY8x8
}

static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{
    ZIGZAG8_FIELD
    COPY8x8
}
#undef ZIG
#undef COPY4x4
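/* CAVLC codes an 8x8 luma block as four 4x4 blocks, so the 64 coefficients
 * (stored 4-way interleaved in scan order in src) are gathered here into
 * four contiguous 16-coefficient runs. */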
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src )
{
    int i,j;
    for( i=0; i<4; i++ )
        for( j=0; j<16; j++ )
            dst[i*16+j] = src[i+j*4];
}
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
    if( b_interlaced )
    {
        pf->scan_8x8 = zigzag_scan_8x8_field;
        pf->scan_4x4 = zigzag_scan_4x4_field;
        pf->sub_8x8  = zigzag_sub_8x8_field;
        pf->sub_4x4  = zigzag_sub_4x4_field;
#ifdef HAVE_MMX
        if( cpu&X264_CPU_MMXEXT )
            pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
#endif

#ifdef ARCH_PPC
        if( cpu&X264_CPU_ALTIVEC )
            pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
#endif
    }
    else
    {
        pf->scan_8x8 = zigzag_scan_8x8_frame;
        pf->scan_4x4 = zigzag_scan_4x4_frame;
        pf->sub_8x8  = zigzag_sub_8x8_frame;
        pf->sub_4x4  = zigzag_sub_4x4_frame;
#ifdef HAVE_MMX
        if( cpu&X264_CPU_MMX )
            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
        if( cpu&X264_CPU_MMXEXT )
            pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
        if( cpu&X264_CPU_SSE2_IS_FAST )
            pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
        if( cpu&X264_CPU_SSSE3 )
        {
            pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
            pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        }
        if( cpu&X264_CPU_PHADD_IS_FAST )
            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
#endif

#ifdef ARCH_PPC
        if( cpu&X264_CPU_ALTIVEC )
            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
#endif
    }

    pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#ifdef HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
#endif
}
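/* Usage sketch (hypothetical caller, mirroring x264_dct_init above):
 *     x264_zigzag_function_t zigzagf;
 *     x264_zigzag_init( cpu_flags, &zigzagf, b_interlaced );
 *     zigzagf.scan_4x4( level, dct4x4 );
 * where cpu_flags is a mask of X264_CPU_* bits. */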