liba52/imdct.c

   1 /*
   2  * imdct.c
   3  * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
   4  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
   5  *
   6  * The ifft algorithms in this file have been largely inspired by Dan
   7  * Bernstein's work, djbfft, available at http://cr.yp.to/djbfft.html
   8  *
   9  * This file is part of a52dec, a free ATSC A-52 stream decoder.
  10  * See http://liba52.sourceforge.net/ for updates.
  11  *
  12  * Modified for use with MPlayer, changes contained in liba52_changes.diff.
  13  * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/
  14  * $Id$
  15  *
  16  * a52dec is free software; you can redistribute it and/or modify
  17  * it under the terms of the GNU General Public License as published by
  18  * the Free Software Foundation; either version 2 of the License, or
  19  * (at your option) any later version.
  20  *
  21  * a52dec is distributed in the hope that it will be useful,
  22  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  23  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  24  * GNU General Public License for more details.
  25  *
  26  * You should have received a copy of the GNU General Public License
  27  * along with this program; if not, write to the Free Software
  28  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  29  *
  30  * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
  31  * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
  32  *   michael did port them from libac3 (untested, perhaps totally broken)
  33  * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
  34  */
  35
  36 #include "config.h"
  37
  38 #include <math.h>
  39 #include <stdio.h>
  40 #ifdef LIBA52_DJBFFT
  41 #include <fftc4.h>
  42 #endif
  43 #ifndef M_PI
  44 #define M_PI 3.1415926535897932384626433832795029
  45 #endif
  46 #include <inttypes.h>
  47
  48 #include "a52.h"
  49 #include "a52_internal.h"
  50 #include "mm_accel.h"
  51 #include "mangle.h"
  52
   /* Public entry point for the 512-sample IMDCT.  It is a function pointer
    * with the same signature as imdct_do_512(), imdct_do_512_altivec() and
    * imdct_do_512_sse() below; its assignment is not in this part of the
    * file (presumably done by the init code after CPU detection - confirm). */
   53 void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
   54
   /* With runtime CPU detection enabled, HAVE_AMD3DNOWEXT is forced to 0
    * for the code that follows. */
   55 #if CONFIG_RUNTIME_CPUDETECT
   56 #undef HAVE_AMD3DNOWEXT
   57 #define HAVE_AMD3DNOWEXT 0
   58 #endif
  59
  60 typedef struct complex_s {
  61     sample_t real;
  62     sample_t imag;
  63 } complex_t;
  64
  65 static const int pm128[128] attribute_used __attribute__((aligned(16))) =
  66 {
  67         0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
  68         4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
  69         2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
  70         6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
  71         1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
  72         5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
  73         3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
  74         7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
  75 };
  76
  77 static uint8_t attribute_used bit_reverse_512[] = {
  78         0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
  79         0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
  80         0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
  81         0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
  82         0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
  83         0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
  84         0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
  85         0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
  86         0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
  87         0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
  88         0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
  89         0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
  90         0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
  91         0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
  92         0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
  93         0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
  94
  95 static uint8_t fftorder[] = {
  96       0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
  97       8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
  98       4,132, 68,196, 36,164,228,100, 20,148, 84,212,244,116, 52,180,
  99     252,124, 60,188, 28,156,220, 92, 12,140, 76,204,236,108, 44,172,
 100       2,130, 66,194, 34,162,226, 98, 18,146, 82,210,242,114, 50,178,
 101      10,138, 74,202, 42,170,234,106,250,122, 58,186, 26,154,218, 90,
 102     254,126, 62,190, 30,158,222, 94, 14,142, 78,206,238,110, 46,174,
 103       6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
 104 };
 105
  /* File-scope work buffer for the 128-point complex IFFT.  It is used by
   * the AltiVec and SSE paths; imdct_do_512() declares its own local buf
   * that shadows this one.  None of the tables below has an initializer
   * here, so they are presumably filled by init code elsewhere - confirm. */
  106 static complex_t __attribute__((aligned(16))) buf[128];
  107
  108 /* Twiddle factor LUT: w[m] points at a table of 2^m entries and is
       * indexed as w[m][k] by the AltiVec butterfly passes. */
  109 static complex_t __attribute__((aligned(16))) w_1[1];
  110 static complex_t __attribute__((aligned(16))) w_2[2];
  111 static complex_t __attribute__((aligned(16))) w_4[4];
  112 static complex_t __attribute__((aligned(16))) w_8[8];
  113 static complex_t __attribute__((aligned(16))) w_16[16];
  114 static complex_t __attribute__((aligned(16))) w_32[32];
  115 static complex_t __attribute__((aligned(16))) w_64[64];
  116 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
  117
  118 /* Twiddle factors for IMDCT: read by the AltiVec pre- and post-IFFT
       * complex multiplies. */
  119 static sample_t __attribute__((aligned(16))) xcos1[128];
  120 static sample_t __attribute__((aligned(16))) xsin1[128];
  121
  122 #if ARCH_X86 || ARCH_X86_64
  123 // NOTE: SSE needs 16-byte alignment or it will segfault.
  124 // The SSE asm addresses these tables by symbol name through MANGLE().
  125 static float __attribute__((aligned(16))) sseSinCos1c[256];
  126 static float __attribute__((aligned(16))) sseSinCos1d[256];
      /* sign pattern {1,1,1,-1}, loaded by the SSE 2nd iteration */
  127 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
  128 //static float __attribute__((aligned(16))) sseW0[4];
  129 static float __attribute__((aligned(16))) sseW1[8];
  130 static float __attribute__((aligned(16))) sseW2[16];
  131 static float __attribute__((aligned(16))) sseW3[32];
  132 static float __attribute__((aligned(16))) sseW4[64];
  133 static float __attribute__((aligned(16))) sseW5[128];
  134 static float __attribute__((aligned(16))) sseW6[256];
      /* sseW[0] is NULL because the sseW0 table is commented out above */
  135 static float __attribute__((aligned(16))) *sseW[7]=
  136         {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
  137 static float __attribute__((aligned(16))) sseWindow[512];
  138 #endif
  139
  140 /* Root values for IFFT: roots16[1] is read directly by ifft8(); each
       * table is also handed to ifft_pass() with a negative offset
       * (roots16 - 4, roots32 - 8, roots64 - 16, roots128 - 32). */
  141 static sample_t roots16[3];
  142 static sample_t roots32[7];
  143 static sample_t roots64[15];
  144 static sample_t roots128[31];
  145
  146 /* Twiddle factors for IMDCT: pre1/post1 are used by imdct_do_512();
       * pre2/post2 are not referenced in this part of the file. */
  147 static complex_t pre1[128];
  148 static complex_t post1[64];
  149 static complex_t pre2[64];
  150 static complex_t post2[32];
  151
      /* 256-entry window applied when overlap-adding with the delay line */
  152 static sample_t a52_imdct_window[256];
  153
      /* Selected IFFT implementations; imdct_do_512() calls through ifft128.
       * ifft128_c/ifft64_c below have matching signatures; the assignment is
       * not in this part of the file. */
  154 static void (* ifft128) (complex_t * buf);
  155 static void (* ifft64) (complex_t * buf);
 156
 157 static inline void ifft2 (complex_t * buf)
 158 {
 159     double r, i;
 160
 161     r = buf[0].real;
 162     i = buf[0].imag;
 163     buf[0].real += buf[1].real;
 164     buf[0].imag += buf[1].imag;
 165     buf[1].real = r - buf[1].real;
 166     buf[1].imag = i - buf[1].imag;
 167 }
 168
 169 static inline void ifft4 (complex_t * buf)
 170 {
 171     double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 172
 173     tmp1 = buf[0].real + buf[1].real;
 174     tmp2 = buf[3].real + buf[2].real;
 175     tmp3 = buf[0].imag + buf[1].imag;
 176     tmp4 = buf[2].imag + buf[3].imag;
 177     tmp5 = buf[0].real - buf[1].real;
 178     tmp6 = buf[0].imag - buf[1].imag;
 179     tmp7 = buf[2].imag - buf[3].imag;
 180     tmp8 = buf[3].real - buf[2].real;
 181
 182     buf[0].real = tmp1 + tmp2;
 183     buf[0].imag = tmp3 + tmp4;
 184     buf[2].real = tmp1 - tmp2;
 185     buf[2].imag = tmp3 - tmp4;
 186     buf[1].real = tmp5 + tmp7;
 187     buf[1].imag = tmp6 + tmp8;
 188     buf[3].real = tmp5 - tmp7;
 189     buf[3].imag = tmp6 - tmp8;
 190 }
 191
 192 /* the basic split-radix ifft butterfly */
 193
 194 #define BUTTERFLY(a0,a1,a2,a3,wr,wi) do {       \
 195     tmp5 = a2.real * wr + a2.imag * wi;         \
 196     tmp6 = a2.imag * wr - a2.real * wi;         \
 197     tmp7 = a3.real * wr - a3.imag * wi;         \
 198     tmp8 = a3.imag * wr + a3.real * wi;         \
 199     tmp1 = tmp5 + tmp7;                         \
 200     tmp2 = tmp6 + tmp8;                         \
 201     tmp3 = tmp6 - tmp8;                         \
 202     tmp4 = tmp7 - tmp5;                         \
 203     a2.real = a0.real - tmp1;                   \
 204     a2.imag = a0.imag - tmp2;                   \
 205     a3.real = a1.real - tmp3;                   \
 206     a3.imag = a1.imag - tmp4;                   \
 207     a0.real += tmp1;                            \
 208     a0.imag += tmp2;                            \
 209     a1.real += tmp3;                            \
 210     a1.imag += tmp4;                            \
 211 } while (0)
 212
 213 /* split-radix ifft butterfly, specialized for wr=1 wi=0 */
 214
 215 #define BUTTERFLY_ZERO(a0,a1,a2,a3) do {        \
 216     tmp1 = a2.real + a3.real;                   \
 217     tmp2 = a2.imag + a3.imag;                   \
 218     tmp3 = a2.imag - a3.imag;                   \
 219     tmp4 = a3.real - a2.real;                   \
 220     a2.real = a0.real - tmp1;                   \
 221     a2.imag = a0.imag - tmp2;                   \
 222     a3.real = a1.real - tmp3;                   \
 223     a3.imag = a1.imag - tmp4;                   \
 224     a0.real += tmp1;                            \
 225     a0.imag += tmp2;                            \
 226     a1.real += tmp3;                            \
 227     a1.imag += tmp4;                            \
 228 } while (0)
 229
 230 /* split-radix ifft butterfly, specialized for wr=wi */
 231
 232 #define BUTTERFLY_HALF(a0,a1,a2,a3,w) do {      \
 233     tmp5 = (a2.real + a2.imag) * w;             \
 234     tmp6 = (a2.imag - a2.real) * w;             \
 235     tmp7 = (a3.real - a3.imag) * w;             \
 236     tmp8 = (a3.imag + a3.real) * w;             \
 237     tmp1 = tmp5 + tmp7;                         \
 238     tmp2 = tmp6 + tmp8;                         \
 239     tmp3 = tmp6 - tmp8;                         \
 240     tmp4 = tmp7 - tmp5;                         \
 241     a2.real = a0.real - tmp1;                   \
 242     a2.imag = a0.imag - tmp2;                   \
 243     a3.real = a1.real - tmp3;                   \
 244     a3.imag = a1.imag - tmp4;                   \
 245     a0.real += tmp1;                            \
 246     a0.imag += tmp2;                            \
 247     a1.real += tmp3;                            \
 248     a1.imag += tmp4;                            \
 249 } while (0)
 250
 251 static inline void ifft8 (complex_t * buf)
 252 {
 253     double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 254
 255     ifft4 (buf);
 256     ifft2 (buf + 4);
 257     ifft2 (buf + 6);
 258     BUTTERFLY_ZERO (buf[0], buf[2], buf[4], buf[6]);
 259     BUTTERFLY_HALF (buf[1], buf[3], buf[5], buf[7], roots16[1]);
 260 }
 261
 262 static void ifft_pass (complex_t * buf, sample_t * weight, int n)
 263 {
 264     complex_t * buf1;
 265     complex_t * buf2;
 266     complex_t * buf3;
 267     double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 268     int i;
 269
 270     buf++;
 271     buf1 = buf + n;
 272     buf2 = buf + 2 * n;
 273     buf3 = buf + 3 * n;
 274
 275     BUTTERFLY_ZERO (buf[-1], buf1[-1], buf2[-1], buf3[-1]);
 276
 277     i = n - 1;
 278
 279     do {
 280         BUTTERFLY (buf[0], buf1[0], buf2[0], buf3[0], weight[n], weight[2*i]);
 281         buf++;
 282         buf1++;
 283         buf2++;
 284         buf3++;
 285         weight++;
 286     } while (--i);
 287 }
 288
 289 static void ifft16 (complex_t * buf)
 290 {
 291     ifft8 (buf);
 292     ifft4 (buf + 8);
 293     ifft4 (buf + 12);
 294     ifft_pass (buf, roots16 - 4, 4);
 295 }
 296
 297 static void ifft32 (complex_t * buf)
 298 {
 299     ifft16 (buf);
 300     ifft8 (buf + 16);
 301     ifft8 (buf + 24);
 302     ifft_pass (buf, roots32 - 8, 8);
 303 }
 304
 305 static void ifft64_c (complex_t * buf)
 306 {
 307     ifft32 (buf);
 308     ifft16 (buf + 32);
 309     ifft16 (buf + 48);
 310     ifft_pass (buf, roots64 - 16, 16);
 311 }
 312
 313 static void ifft128_c (complex_t * buf)
 314 {
 315     ifft32 (buf);
 316     ifft16 (buf + 32);
 317     ifft16 (buf + 48);
 318     ifft_pass (buf, roots64 - 16, 16);
 319
 320     ifft32 (buf + 64);
 321     ifft32 (buf + 96);
 322     ifft_pass (buf, roots128 - 32, 32);
 323 }
 324
 325 void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
 326 {
 327     int i, k;
 328     sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
 329     const sample_t * window = a52_imdct_window;
 330     complex_t buf[128];
 331
 332     for (i = 0; i < 128; i++) {
 333         k = fftorder[i];
 334         t_r = pre1[i].real;
 335         t_i = pre1[i].imag;
 336
 337         buf[i].real = t_i * data[255-k] + t_r * data[k];
 338         buf[i].imag = t_r * data[255-k] - t_i * data[k];
 339     }
 340
 341     ifft128 (buf);
 342
 343     /* Post IFFT complex multiply plus IFFT complex conjugate*/
 344     /* Window and convert to real valued signal */
 345     for (i = 0; i < 64; i++) {
 346         /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
 347         t_r = post1[i].real;
 348         t_i = post1[i].imag;
 349
 350         a_r = t_r * buf[i].real     + t_i * buf[i].imag;
 351         a_i = t_i * buf[i].real     - t_r * buf[i].imag;
 352         b_r = t_i * buf[127-i].real + t_r * buf[127-i].imag;
 353         b_i = t_r * buf[127-i].real - t_i * buf[127-i].imag;
 354
 355         w_1 = window[2*i];
 356         w_2 = window[255-2*i];
 357         data[2*i]     = delay[2*i] * w_2 - a_r * w_1 + bias;
 358         data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
 359         delay[2*i] = a_i;
 360
 361         w_1 = window[2*i+1];
 362         w_2 = window[254-2*i];
 363         data[2*i+1]   = delay[2*i+1] * w_2 + b_r * w_1 + bias;
 364         data[254-2*i] = delay[2*i+1] * w_1 - b_r * w_2 + bias;
 365         delay[2*i+1] = b_i;
 366     }
 367 }
 368
  369 #if HAVE_ALTIVEC
  370
  371 #ifdef HAVE_ALTIVEC_H
  372 #include <altivec.h>
  373 #endif
  374
  375 // used to build register permutation vectors (vcprm)
  376 // the 's' are for words in the _s_econd vector
      // Each WORD_x expands to the four byte indices of one 32-bit word:
      // 0x00-0x0f select from the first vec_perm operand, 0x10-0x1f from
      // the second.
  377 #define WORD_0 0x00,0x01,0x02,0x03
  378 #define WORD_1 0x04,0x05,0x06,0x07
  379 #define WORD_2 0x08,0x09,0x0a,0x0b
  380 #define WORD_3 0x0c,0x0d,0x0e,0x0f
  381 #define WORD_s0 0x10,0x11,0x12,0x13
  382 #define WORD_s1 0x14,0x15,0x16,0x17
  383 #define WORD_s2 0x18,0x19,0x1a,0x1b
  384 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
  385
      // vcprm(a,b,c,d): 16-byte permute control picking words a,b,c,d.
      // vcii(a,b,c,d): float vector of +1/-1 chosen by the letters p/n.
  386 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
  387 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
  388
      // FOUROF(a): initializer list with the same value in all four lanes.
  389 #define FOUROF(a) {a,a,a,a}
  390
  391 // vcprmle is used to keep the same index as in the SSE version.
  392 // it's the same as vcprm, with the order of the indices reversed
  393 // ('le' is Little Endian)
  394 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
  395
  396 // used to build inverse/identity vectors (vcii)
  397 // n is _n_egative, p is _p_ositive
  398 #define FLOAT_n -1.
  399 #define FLOAT_p 1.
 400
 401
  /*
   * AltiVec implementation of the 512-sample IMDCT.
   *
   * data  - 256 input coefficients; overwritten with 256 output samples.
   * delay - 256-entry overlap buffer; added into the output, then
   *         overwritten with the windowed tail of this block.
   * bias  - constant added to every output sample.
   *
   * The 128-point complex IFFT runs in place in the file-scope buf[].
   * Every "#if 0" branch holds the scalar reference code for the vector
   * code in the matching "#else" branch.  The vec_ld/vec_st offsets
   * "x << 3" step by 8 bytes per index, i.e. one complex_t if sample_t is
   * a 4-byte float - TODO confirm.
   */
  402 void
  403 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
  404 {
  405   int i;
  406   int k;
        /* NOTE(review): p and q are redeclared inside the 4-7 iteration loop;
         * these outer declarations are never used. */
  407   int p,q;
  408   int m;
  409   long two_m;
  410   long two_m_plus_one;
  411
        /* tmp_a_* are only referenced by the disabled "#if 0" reference code */
  412   sample_t tmp_b_i;
  413   sample_t tmp_b_r;
  414   sample_t tmp_a_i;
  415   sample_t tmp_a_r;
  416
  417   sample_t *data_ptr;
  418   sample_t *delay_ptr;
  419   sample_t *window_ptr;
  420
  421   /* 512 IMDCT with source and dest data in 'data' */
  422
  423   /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
  424   for( i=0; i < 128; i++) {
  425     /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
          /* j is the bit-reversed index, so buf[] is filled in bit-reversed order */
  426     int j= bit_reverse_512[i];
  427     buf[i].real =         (data[256-2*j-1] * xcos1[j])  -  (data[2*j]       * xsin1[j]);
  428     buf[i].imag = -1.0 * ((data[2*j]       * xcos1[j])  +  (data[256-2*j-1] * xsin1[j]));
  429   }
  430
  431   /* 1. iteration */
  432   for(i = 0; i < 128; i += 2) {
  433 #if 0
  434     tmp_a_r = buf[i].real;
  435     tmp_a_i = buf[i].imag;
  436     tmp_b_r = buf[i+1].real;
  437     tmp_b_i = buf[i+1].imag;
  438     buf[i].real = tmp_a_r + tmp_b_r;
  439     buf[i].imag =  tmp_a_i + tmp_b_i;
  440     buf[i+1].real = tmp_a_r - tmp_b_r;
  441     buf[i+1].imag =  tmp_a_i - tmp_b_i;
  442 #else
  443     vector float temp, bufv;
  444
          /* bufv = {a, b}; temp = {b, a}; bufv * {1,1,-1,-1} + temp = {a+b, a-b} */
  445     bufv = vec_ld(i << 3, (float*)buf);
  446     temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
  447     bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
  448     vec_st(bufv, i << 3, (float*)buf);
  449 #endif
  450   }
  451
  452   /* 2. iteration */
  453   // Note w[1]={{1,0}, {0,-1}}
  454   for(i = 0; i < 128; i += 4) {
  455 #if 0
  456     tmp_a_r = buf[i].real;
  457     tmp_a_i = buf[i].imag;
  458     tmp_b_r = buf[i+2].real;
  459     tmp_b_i = buf[i+2].imag;
  460     buf[i].real = tmp_a_r + tmp_b_r;
  461     buf[i].imag =  tmp_a_i + tmp_b_i;
  462     buf[i+2].real = tmp_a_r - tmp_b_r;
  463     buf[i+2].imag =  tmp_a_i - tmp_b_i;
  464     tmp_a_r = buf[i+1].real;
  465     tmp_a_i = buf[i+1].imag;
  466     /* WARNING: im <-> re here ! */
  467     tmp_b_r = buf[i+3].imag;
  468     tmp_b_i = buf[i+3].real;
  469     buf[i+1].real = tmp_a_r + tmp_b_r;
  470     buf[i+1].imag =  tmp_a_i - tmp_b_i;
  471     buf[i+3].real = tmp_a_r - tmp_b_r;
  472     buf[i+3].imag =  tmp_a_i + tmp_b_i;
  473 #else
  474     vector float buf01, buf23, temp1, temp2;
  475
  476     buf01 = vec_ld((i + 0) << 3, (float*)buf);
  477     buf23 = vec_ld((i + 2) << 3, (float*)buf);
          /* swap re/im of buf[i+3] only, matching the scalar reference above */
  478     buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
  479
  480     temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
  481     temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
  482
  483     vec_st(temp1, (i + 0) << 3, (float*)buf);
  484     vec_st(temp2, (i + 2) << 3, (float*)buf);
  485 #endif
  486   }
  487
  488   /* 3. iteration */
  489   for(i = 0; i < 128; i += 8) {
  490 #if 0
  491     tmp_a_r = buf[i].real;
  492     tmp_a_i = buf[i].imag;
  493     tmp_b_r = buf[i+4].real;
  494     tmp_b_i = buf[i+4].imag;
  495     buf[i].real = tmp_a_r + tmp_b_r;
  496     buf[i].imag =  tmp_a_i + tmp_b_i;
  497     buf[i+4].real = tmp_a_r - tmp_b_r;
  498     buf[i+4].imag =  tmp_a_i - tmp_b_i;
  499     tmp_a_r = buf[1+i].real;
  500     tmp_a_i = buf[1+i].imag;
  501     tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
  502     tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
  503     buf[1+i].real = tmp_a_r + tmp_b_r;
  504     buf[1+i].imag =  tmp_a_i + tmp_b_i;
  505     buf[i+5].real = tmp_a_r - tmp_b_r;
  506     buf[i+5].imag =  tmp_a_i - tmp_b_i;
  507     tmp_a_r = buf[i+2].real;
  508     tmp_a_i = buf[i+2].imag;
  509     /* WARNING re <-> im & sign */
  510     tmp_b_r = buf[i+6].imag;
  511     tmp_b_i = - buf[i+6].real;
  512     buf[i+2].real = tmp_a_r + tmp_b_r;
  513     buf[i+2].imag =  tmp_a_i + tmp_b_i;
  514     buf[i+6].real = tmp_a_r - tmp_b_r;
  515     buf[i+6].imag =  tmp_a_i - tmp_b_i;
  516     tmp_a_r = buf[i+3].real;
  517     tmp_a_i = buf[i+3].imag;
  518     tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
  519     tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
  520     buf[i+3].real = tmp_a_r + tmp_b_r;
  521     buf[i+3].imag =  tmp_a_i + tmp_b_i;
  522     buf[i+7].real = tmp_a_r - tmp_b_r;
  523     buf[i+7].imag =  tmp_a_i - tmp_b_i;
  524 #else
  525     vector float buf01, buf23, buf45, buf67;
  526
  527     buf01 = vec_ld((i + 0) << 3, (float*)buf);
          /* NOTE(review): buf23 is loaded again below before its first use and
           * only buf[i+5] / buf[i+7] are written in between, so this load
           * looks redundant. */
  528     buf23 = vec_ld((i + 2) << 3, (float*)buf);
  529
          /* The twiddle multiplies for buf[i+5] and buf[i+7] are done in scalar
           * code and written back to memory before the vector loads below. */
  530     tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
  531     tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
  532     buf[i+5].real = tmp_b_r;
  533     buf[i+5].imag = tmp_b_i;
  534     tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
  535     tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
  536     buf[i+7].real = tmp_b_r;
  537     buf[i+7].imag = tmp_b_i;
  538
  539     buf23 = vec_ld((i + 2) << 3, (float*)buf);
  540     buf45 = vec_ld((i + 4) << 3, (float*)buf);
  541     buf67 = vec_ld((i + 6) << 3, (float*)buf);
          /* swap re/im of buf[i+6] only; its sign is applied by vcii(p,n,p,p) */
  542     buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
  543
  544     vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
  545     vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
  546     vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
  547     vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
  548 #endif
  549   }
  550
  551   /* 4-7. iterations */
        /* m = 3..6: butterfly span two_m = 8, 16, 32, 64, twiddles from w[m];
         * each inner step processes two butterflies (k and k+1) at once. */
  552   for (m=3; m < 7; m++) {
  553     two_m = (1 << m);
  554
  555     two_m_plus_one = two_m<<1;
  556
  557     for(i = 0; i < 128; i += two_m_plus_one) {
  558       for(k = 0; k < two_m; k+=2) {
  559 #if 0
  560         int p = k + i;
  561         int q = p + two_m;
  562         tmp_a_r = buf[p].real;
  563         tmp_a_i = buf[p].imag;
  564         tmp_b_r =
  565           buf[q].real * w[m][k].real -
  566           buf[q].imag * w[m][k].imag;
  567         tmp_b_i =
  568           buf[q].imag * w[m][k].real +
  569           buf[q].real * w[m][k].imag;
  570         buf[p].real = tmp_a_r + tmp_b_r;
  571         buf[p].imag =  tmp_a_i + tmp_b_i;
  572         buf[q].real = tmp_a_r - tmp_b_r;
  573         buf[q].imag =  tmp_a_i - tmp_b_i;
  574
  575         tmp_a_r = buf[(p + 1)].real;
  576         tmp_a_i = buf[(p + 1)].imag;
  577         tmp_b_r =
  578           buf[(q + 1)].real * w[m][(k + 1)].real -
  579           buf[(q + 1)].imag * w[m][(k + 1)].imag;
  580         tmp_b_i =
  581           buf[(q + 1)].imag * w[m][(k + 1)].real +
  582           buf[(q + 1)].real * w[m][(k + 1)].imag;
  583         buf[(p + 1)].real = tmp_a_r + tmp_b_r;
  584         buf[(p + 1)].imag =  tmp_a_i + tmp_b_i;
  585         buf[(q + 1)].real = tmp_a_r - tmp_b_r;
  586         buf[(q + 1)].imag =  tmp_a_i - tmp_b_i;
  587 #else
  588         int p = k + i;
  589         int q = p + two_m;
  590         vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
  591         const vector float vczero = (const vector float)FOUROF(0.);
  592         // first compute buf[q] and buf[q+1]
  593         vecq = vec_ld(q << 3, (float*)buf);
  594         vecw = vec_ld(0, (float*)&(w[m][k]));
  595         temp1 = vec_madd(vecq, vecw, vczero);
  596         temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
  597         temp2 = vec_madd(temp2, vecw, vczero);
  598         temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
  599         temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
  600         vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
  601         // then butterfly with buf[p] and buf[p+1]
  602         vecp = vec_ld(p << 3, (float*)buf);
  603
  604         temp1 = vec_add(vecp, vecq);
  605         temp2 = vec_sub(vecp, vecq);
  606
  607         vec_st(temp1, p << 3, (float*)buf);
  608         vec_st(temp2, q << 3, (float*)buf);
  609 #endif
  610       }
  611     }
  612   }
  613
  614   /* Post IFFT complex multiply  plus IFFT complex conjugate*/
  615   for( i=0; i < 128; i+=4) {
  616     /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
  617 #if 0
  618     tmp_a_r =        buf[(i + 0)].real;
  619     tmp_a_i = -1.0 * buf[(i + 0)].imag;
  620     buf[(i + 0)].real =
  621       (tmp_a_r * xcos1[(i + 0)])  -  (tmp_a_i  * xsin1[(i + 0)]);
  622     buf[(i + 0)].imag =
  623       (tmp_a_r * xsin1[(i + 0)])  +  (tmp_a_i  * xcos1[(i + 0)]);
  624
  625     tmp_a_r =        buf[(i + 1)].real;
  626     tmp_a_i = -1.0 * buf[(i + 1)].imag;
  627     buf[(i + 1)].real =
  628       (tmp_a_r * xcos1[(i + 1)])  -  (tmp_a_i  * xsin1[(i + 1)]);
  629     buf[(i + 1)].imag =
  630       (tmp_a_r * xsin1[(i + 1)])  +  (tmp_a_i  * xcos1[(i + 1)]);
  631
  632     tmp_a_r =        buf[(i + 2)].real;
  633     tmp_a_i = -1.0 * buf[(i + 2)].imag;
  634     buf[(i + 2)].real =
  635       (tmp_a_r * xcos1[(i + 2)])  -  (tmp_a_i  * xsin1[(i + 2)]);
  636     buf[(i + 2)].imag =
  637       (tmp_a_r * xsin1[(i + 2)])  +  (tmp_a_i  * xcos1[(i + 2)]);
  638
  639     tmp_a_r =        buf[(i + 3)].real;
  640     tmp_a_i = -1.0 * buf[(i + 3)].imag;
  641     buf[(i + 3)].real =
  642       (tmp_a_r * xcos1[(i + 3)])  -  (tmp_a_i  * xsin1[(i + 3)]);
  643     buf[(i + 3)].imag =
  644       (tmp_a_r * xsin1[(i + 3)])  +  (tmp_a_i  * xcos1[(i + 3)]);
  645 #else
  646     vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
  647     vector float temp0022, temp1133, tempCS01;
  648     const vector float vczero = (const vector float)FOUROF(0.);
  649
  650     bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
  651     bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
  652
          /* xcos1/xsin1 hold one value per complex entry, hence i << 2 here */
  653     cosv = vec_ld(i << 2, xcos1);
  654     sinv = vec_ld(i << 2, xsin1);
  655
  656     temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
  657     temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
  658     tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
  659     temp1 = vec_madd(temp0022, tempCS01, vczero);
  660     tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
  661     temp2 = vec_madd(temp1133, tempCS01, vczero);
  662     bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
  663
  664     vec_st(bufv_0, (i + 0) << 3, (float*)buf);
  665
  666     /* idem with bufv_2 and high-order cosv/sinv */
  667
  668     temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
  669     temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
  670     tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
  671     temp1 = vec_madd(temp0022, tempCS01, vczero);
  672     tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
  673     temp2 = vec_madd(temp1133, tempCS01, vczero);
  674     bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
  675
  676     vec_st(bufv_2, (i + 2) << 3, (float*)buf);
  677
  678 #endif
  679   }
  680
  681   data_ptr = data;
  682   delay_ptr = delay;
  683   window_ptr = a52_imdct_window;
  684
  685   /* Window and convert to real valued signal */
        /* The two loops below write 256 samples in total: window_ptr and
         * delay_ptr both advance through all 256 entries. */
  686   for(i=0; i< 64; i++) {
  687     *data_ptr++   = -buf[64+i].imag   * *window_ptr++ + *delay_ptr++ + bias;
  688     *data_ptr++   =  buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
  689   }
  690
  691   for(i=0; i< 64; i++) {
  692     *data_ptr++  = -buf[i].real       * *window_ptr++ + *delay_ptr++ + bias;
  693     *data_ptr++  =  buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
  694   }
  695
  696   /* The trailing edge of the window goes into the delay line */
        /* window_ptr is now one past the end of the window and is walked
         * backwards, so the window is applied in reverse order here. */
  697   delay_ptr = delay;
  698
  699   for(i=0; i< 64; i++) {
  700     *delay_ptr++  = -buf[64+i].real   * *--window_ptr;
  701     *delay_ptr++  =  buf[64-i-1].imag * *--window_ptr;
  702   }
  703
  704   for(i=0; i<64; i++) {
  705     *delay_ptr++  =  buf[i].imag       * *--window_ptr;
  706     *delay_ptr++  = -buf[128-i-1].real * *--window_ptr;
  707   }
  708 }
 709 #endif
 710
 711
  712 // Stuff below this line is borrowed from libac3
  713 #include "srfftp.h"
  714 #if ARCH_X86 || ARCH_X86_64
      // HAVE_AMD3DNOW is forced on for every x86 build before the 3DNow!
      // FFT header is included.
  715 #undef HAVE_AMD3DNOW
  716 #define HAVE_AMD3DNOW 1
  717 #include "srfftp_3dnow.h"
  718
      // Pairs with 0x80000000 (the float sign bit) in one of the two 32-bit
      // halves; presumably XOR masks that negate one half of a packed pair -
      // confirm in the 3DNow! headers.
  719 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
  720 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
      // sqrt(2)/2 in both the real and the imaginary slot
  721 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
  722
      // imdct_3dnow.h is included twice, once with HAVE_AMD3DNOWEXT 0 and once
      // with HAVE_AMD3DNOWEXT 1; presumably it emits a differently named
      // routine for each setting - confirm in imdct_3dnow.h.
  723 #undef HAVE_AMD3DNOWEXT
  724 #define HAVE_AMD3DNOWEXT 0
  725 #include "imdct_3dnow.h"
  726 #undef HAVE_AMD3DNOWEXT
  727 #define HAVE_AMD3DNOWEXT 1
  728 #include "imdct_3dnow.h"
 729
 730 void
 731 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 732 {
 733 /*      int i,k;
 734     int p,q;*/
 735     int m;
 736     long two_m;
 737     long two_m_plus_one;
 738     long two_m_plus_one_shl3;
 739     complex_t *buf_offset;
 740
 741 /*  sample_t tmp_a_i;
 742     sample_t tmp_a_r;
 743     sample_t tmp_b_i;
 744     sample_t tmp_b_r;*/
 745
 746     sample_t *data_ptr;
 747     sample_t *delay_ptr;
 748     sample_t *window_ptr;
 749
 750     /* 512 IMDCT with source and dest data in 'data' */
 751     /* see the c version (dct_do_512()), its allmost identical, just in C */
 752
 753     /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
 754     /* Bit reversed shuffling */
 755         __asm__ volatile(
 756                 "xor %%"REG_S", %%"REG_S"               \n\t"
 757                 "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
 758                 "mov $1008, %%"REG_D"                   \n\t"
 759                 "push %%"REG_BP"                        \n\t" //use ebp without telling gcc
 760                 ASMALIGN(4)
 761                 "1:                                     \n\t"
 762                 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
 763                 "movhps 8(%0, %%"REG_D"), %%xmm0        \n\t" // RXXI
 764                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // XXXi
 765                 "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
 766                 "shufps $0x33, %%xmm1, %%xmm0           \n\t" // irIR
 767                 "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
 768                 "mulps %%xmm0, %%xmm2                   \n\t"
 769                 "shufps $0xB1, %%xmm0, %%xmm0           \n\t" // riRI
 770                 "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
 771                 "subps %%xmm0, %%xmm2                   \n\t"
 772                 "movzb (%%"REG_a"), %%"REG_d"           \n\t"
 773                 "movzb 1(%%"REG_a"), %%"REG_BP"         \n\t"
 774                 "movlps %%xmm2, (%1, %%"REG_d", 8)      \n\t"
 775                 "movhps %%xmm2, (%1, %%"REG_BP", 8)     \n\t"
 776                 "add $16, %%"REG_S"                     \n\t"
 777                 "add $2, %%"REG_a"                      \n\t" // avoid complex addressing for P4 crap
 778                 "sub $16, %%"REG_D"                     \n\t"
 779                 "jnc 1b                                 \n\t"
 780                 "pop %%"REG_BP"                         \n\t"//no we didnt touch ebp *g*
 781                 :: "b" (data), "c" (buf)
 782                 : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
 783         );
 784
 785
 786     /* FFT Merge */
 787 /* unoptimized variant
 788     for (m=1; m < 7; m++) {
 789         if(m)
 790             two_m = (1 << m);
 791         else
 792             two_m = 1;
 793
 794         two_m_plus_one = (1 << (m+1));
 795
 796         for(i = 0; i < 128; i += two_m_plus_one) {
 797             for(k = 0; k < two_m; k++) {
 798                 p = k + i;
 799                 q = p + two_m;
 800                 tmp_a_r = buf[p].real;
 801                 tmp_a_i = buf[p].imag;
 802                 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
 803                 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
 804                 buf[p].real = tmp_a_r + tmp_b_r;
 805                 buf[p].imag =  tmp_a_i + tmp_b_i;
 806                 buf[q].real = tmp_a_r - tmp_b_r;
 807                 buf[q].imag =  tmp_a_i - tmp_b_i;
 808             }
 809         }
 810     }
 811 */
 812
 813     /* 1. iteration */
 814         // Note w[0][0]={1,0}
 815         __asm__ volatile(
 816                 "xorps %%xmm1, %%xmm1   \n\t"
 817                 "xorps %%xmm2, %%xmm2   \n\t"
 818                 "mov %0, %%"REG_S"      \n\t"
 819                 ASMALIGN(4)
 820                 "1:                     \n\t"
 821                 "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
 822                 "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
 823                 "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
 824                 "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
 825                 "addps %%xmm1, %%xmm0   \n\t"
 826                 "subps %%xmm2, %%xmm0   \n\t"
 827                 "movaps %%xmm0, (%%"REG_S")\n\t"
 828                 "add $16, %%"REG_S"     \n\t"
 829                 "cmp %1, %%"REG_S"      \n\t"
 830                 " jb 1b                 \n\t"
 831                 :: "g" (buf), "r" (buf + 128)
 832                 : "%"REG_S
 833         );
 834
 835     /* 2. iteration */
 836         // Note w[1]={{1,0}, {0,-1}}
 837         __asm__ volatile(
 838                 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
 839                 "mov %0, %%"REG_S"              \n\t"
 840                 ASMALIGN(4)
 841                 "1:                             \n\t"
 842                 "movaps 16(%%"REG_S"), %%xmm2   \n\t" //r2,i2,r3,i3
 843                 "shufps $0xB4, %%xmm2, %%xmm2   \n\t" //r2,i2,i3,r3
 844                 "mulps %%xmm7, %%xmm2           \n\t" //r2,i2,i3,-r3
 845                 "movaps (%%"REG_S"), %%xmm0     \n\t" //r0,i0,r1,i1
 846                 "movaps (%%"REG_S"), %%xmm1     \n\t" //r0,i0,r1,i1
 847                 "addps %%xmm2, %%xmm0           \n\t"
 848                 "subps %%xmm2, %%xmm1           \n\t"
 849                 "movaps %%xmm0, (%%"REG_S")     \n\t"
 850                 "movaps %%xmm1, 16(%%"REG_S")   \n\t"
 851                 "add $32, %%"REG_S"     \n\t"
 852                 "cmp %1, %%"REG_S"      \n\t"
 853                 " jb 1b                 \n\t"
 854                 :: "g" (buf), "r" (buf + 128)
 855                 : "%"REG_S
 856         );
 857
 858     /* 3. iteration */
 859 /*
 860  Note sseW2+0={1,1,sqrt(2),sqrt(2))
 861  Note sseW2+16={0,0,sqrt(2),-sqrt(2))
 862  Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
 863  Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
 864 */
 865         __asm__ volatile(
 866                 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
 867                 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
 868                 "xorps %%xmm5, %%xmm5           \n\t"
 869                 "xorps %%xmm2, %%xmm2           \n\t"
 870                 "mov %0, %%"REG_S"              \n\t"
 871                 ASMALIGN(4)
 872                 "1:                             \n\t"
 873                 "movaps 32(%%"REG_S"), %%xmm2   \n\t" //r4,i4,r5,i5
 874                 "movaps 48(%%"REG_S"), %%xmm3   \n\t" //r6,i6,r7,i7
 875                 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
 876                 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
 877                 "mulps %%xmm2, %%xmm4           \n\t"
 878                 "mulps %%xmm3, %%xmm5           \n\t"
 879                 "shufps $0xB1, %%xmm2, %%xmm2   \n\t" //i4,r4,i5,r5
 880                 "shufps $0xB1, %%xmm3, %%xmm3   \n\t" //i6,r6,i7,r7
 881                 "mulps %%xmm6, %%xmm3           \n\t"
 882                 "mulps %%xmm7, %%xmm2           \n\t"
 883                 "movaps (%%"REG_S"), %%xmm0     \n\t" //r0,i0,r1,i1
 884                 "movaps 16(%%"REG_S"), %%xmm1   \n\t" //r2,i2,r3,i3
 885                 "addps %%xmm4, %%xmm2           \n\t"
 886                 "addps %%xmm5, %%xmm3           \n\t"
 887                 "movaps %%xmm2, %%xmm4          \n\t"
 888                 "movaps %%xmm3, %%xmm5          \n\t"
 889                 "addps %%xmm0, %%xmm2           \n\t"
 890                 "addps %%xmm1, %%xmm3           \n\t"
 891                 "subps %%xmm4, %%xmm0           \n\t"
 892                 "subps %%xmm5, %%xmm1           \n\t"
 893                 "movaps %%xmm2, (%%"REG_S")     \n\t"
 894                 "movaps %%xmm3, 16(%%"REG_S")   \n\t"
 895                 "movaps %%xmm0, 32(%%"REG_S")   \n\t"
 896                 "movaps %%xmm1, 48(%%"REG_S")   \n\t"
 897                 "add $64, %%"REG_S"     \n\t"
 898                 "cmp %1, %%"REG_S"      \n\t"
 899                 " jb 1b                 \n\t"
 900                 :: "g" (buf), "r" (buf + 128)
 901                 : "%"REG_S
 902         );
 903
 904     /* 4-7. iterations */
 905     for (m=3; m < 7; m++) {
 906         two_m = (1 << m);
 907         two_m_plus_one = two_m<<1;
 908         two_m_plus_one_shl3 = (two_m_plus_one<<3);
 909         buf_offset = buf+128;
 910         __asm__ volatile(
 911                 "mov %0, %%"REG_S"                      \n\t"
 912                 ASMALIGN(4)
 913                 "1:                                     \n\t"
 914                 "xor %%"REG_D", %%"REG_D"               \n\t" // k
 915                 "lea (%%"REG_S", %3), %%"REG_d"         \n\t"
 916                 "2:                                     \n\t"
 917                 "movaps (%%"REG_d", %%"REG_D"), %%xmm1  \n\t"
 918                 "movaps (%4, %%"REG_D", 2), %%xmm2      \n\t"
 919                 "mulps %%xmm1, %%xmm2                   \n\t"
 920                 "shufps $0xB1, %%xmm1, %%xmm1           \n\t"
 921                 "mulps 16(%4, %%"REG_D", 2), %%xmm1     \n\t"
 922                 "movaps (%%"REG_S", %%"REG_D"), %%xmm0  \n\t"
 923                 "addps %%xmm2, %%xmm1                   \n\t"
 924                 "movaps %%xmm1, %%xmm2                  \n\t"
 925                 "addps %%xmm0, %%xmm1                   \n\t"
 926                 "subps %%xmm2, %%xmm0                   \n\t"
 927                 "movaps %%xmm1, (%%"REG_S", %%"REG_D")  \n\t"
 928                 "movaps %%xmm0, (%%"REG_d", %%"REG_D")  \n\t"
 929                 "add $16, %%"REG_D"                     \n\t"
 930                 "cmp %3, %%"REG_D"                      \n\t" //FIXME (opt) count against 0
 931                 "jb 2b                                  \n\t"
 932                 "add %2, %%"REG_S"                      \n\t"
 933                 "cmp %1, %%"REG_S"                      \n\t"
 934                 " jb 1b                                 \n\t"
 935                 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
 936                    "r" (sseW[m])
 937                 : "%"REG_S, "%"REG_D, "%"REG_d
 938         );
 939     }
 940
 941     /* Post IFFT complex multiply  plus IFFT complex conjugate*/
 942         __asm__ volatile(
 943                 "mov $-1024, %%"REG_S"                  \n\t"
 944                 ASMALIGN(4)
 945                 "1:                                     \n\t"
 946                 "movaps (%0, %%"REG_S"), %%xmm0         \n\t"
 947                 "movaps (%0, %%"REG_S"), %%xmm1         \n\t"
 948                 "shufps $0xB1, %%xmm0, %%xmm0           \n\t"
 949                 "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
 950                 "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
 951                 "addps %%xmm1, %%xmm0                   \n\t"
 952                 "movaps %%xmm0, (%0, %%"REG_S")         \n\t"
 953                 "add $16, %%"REG_S"                     \n\t"
 954                 " jnz 1b                                \n\t"
 955                 :: "r" (buf+128)
 956                 : "%"REG_S
 957         );
 958
 959
 960     data_ptr = data;
 961     delay_ptr = delay;
 962     window_ptr = a52_imdct_window;
 963
 964     /* Window and convert to real valued signal */
 965         __asm__ volatile(
 966                 "xor %%"REG_D", %%"REG_D"               \n\t"  // 0
 967                 "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
 968                 "movss %3, %%xmm2                       \n\t"  // bias
 969                 "shufps $0x00, %%xmm2, %%xmm2           \n\t"  // bias, bias, ...
 970                 ASMALIGN(4)
 971                 "1:                                     \n\t"
 972                 "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? A ?
 973                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? C ?
 974                 "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // ? D C ?
 975                 "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // ? B A ?
 976                 "shufps $0x99, %%xmm1, %%xmm0           \n\t" // D C B A
 977                 "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
 978                 "addps (%2, %%"REG_S"), %%xmm0          \n\t"
 979                 "addps %%xmm2, %%xmm0                   \n\t"
 980                 "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
 981                 "add  $16, %%"REG_S"                    \n\t"
 982                 "sub  $16, %%"REG_D"                    \n\t"
 983                 "cmp  $512, %%"REG_S"                   \n\t"
 984                 " jb 1b                                 \n\t"
 985                 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
 986                 : "%"REG_S, "%"REG_D
 987         );
 988         data_ptr+=128;
 989         delay_ptr+=128;
 990 //      window_ptr+=128;
 991
 992         __asm__ volatile(
 993                 "mov $1024, %%"REG_D"                   \n\t"  // 512
 994                 "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
 995                 "movss %3, %%xmm2                       \n\t"  // bias
 996                 "shufps $0x00, %%xmm2, %%xmm2           \n\t"  // bias, bias, ...
 997                 ASMALIGN(4)
 998                 "1:                                     \n\t"
 999                 "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? ? A
1000                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? ? C
1001                 "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // D ? ? C
1002                 "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // B ? ? A
1003                 "shufps $0xCC, %%xmm1, %%xmm0           \n\t" // D C B A
1004                 "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1005                 "addps (%2, %%"REG_S"), %%xmm0          \n\t"
1006                 "addps %%xmm2, %%xmm0                   \n\t"
1007                 "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
1008                 "add $16, %%"REG_S"                     \n\t"
1009                 "sub $16, %%"REG_D"                     \n\t"
1010                 "cmp $512, %%"REG_S"                    \n\t"
1011                 " jb 1b                                 \n\t"
1012                 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1013                 : "%"REG_S, "%"REG_D
1014         );
1015         data_ptr+=128;
1016 //      window_ptr+=128;
1017
1018     /* The trailing edge of the window goes into the delay line */
1019     delay_ptr = delay;
1020
1021         __asm__ volatile(
1022                 "xor %%"REG_D", %%"REG_D"               \n\t"  // 0
1023                 "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
1024                 ASMALIGN(4)
1025                 "1:                                     \n\t"
1026                 "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? ? A
1027                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? ? C
1028                 "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // D ? ? C
1029                 "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // B ? ? A
1030                 "shufps $0xCC, %%xmm1, %%xmm0           \n\t" // D C B A
1031                 "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1032                 "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
1033                 "add $16, %%"REG_S"                     \n\t"
1034                 "sub $16, %%"REG_D"                     \n\t"
1035                 "cmp $512, %%"REG_S"                    \n\t"
1036                 " jb 1b                                 \n\t"
1037                 :: "r" (buf+64), "r" (delay_ptr)
1038                 : "%"REG_S, "%"REG_D
1039         );
1040         delay_ptr+=128;
1041 //      window_ptr-=128;
1042
1043         __asm__ volatile(
1044                 "mov $1024, %%"REG_D"                   \n\t"  // 1024
1045                 "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
1046                 ASMALIGN(4)
1047                 "1:                                     \n\t"
1048                 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
1049                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? C ?
1050                 "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // ? D C ?
1051                 "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // ? B A ?
1052                 "shufps $0x99, %%xmm1, %%xmm0           \n\t" // D C B A
1053                 "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1054                 "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
1055                 "add $16, %%"REG_S"                     \n\t"
1056                 "sub $16, %%"REG_D"                     \n\t"
1057                 "cmp $512, %%"REG_S"                    \n\t"
1058                 " jb 1b                                 \n\t"
1059                 :: "r" (buf), "r" (delay_ptr)
1060                 : "%"REG_S, "%"REG_D
1061         );
1062 }
1063 #endif // ARCH_X86 || ARCH_X86_64
1064
1065 void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
1066 {
1067     int i, k;
1068     sample_t t_r, t_i, a_r, a_i, b_r, b_i, c_r, c_i, d_r, d_i, w_1, w_2;
1069     const sample_t * window = a52_imdct_window;
1070     complex_t buf1[64], buf2[64];
1071
1072     /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1073     for (i = 0; i < 64; i++) {
1074         k = fftorder[i];
1075         t_r = pre2[i].real;
1076         t_i = pre2[i].imag;
1077
1078         buf1[i].real = t_i * data[254-k] + t_r * data[k];
1079         buf1[i].imag = t_r * data[254-k] - t_i * data[k];
1080
1081         buf2[i].real = t_i * data[255-k] + t_r * data[k+1];
1082         buf2[i].imag = t_r * data[255-k] - t_i * data[k+1];
1083     }
1084
1085     ifft64 (buf1);
1086     ifft64 (buf2);
1087
1088     /* Post IFFT complex multiply */
1089     /* Window and convert to real valued signal */
1090     for (i = 0; i < 32; i++) {
1091         /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1092         t_r = post2[i].real;
1093         t_i = post2[i].imag;
1094
1095         a_r = t_r * buf1[i].real    + t_i * buf1[i].imag;
1096         a_i = t_i * buf1[i].real    - t_r * buf1[i].imag;
1097         b_r = t_i * buf1[63-i].real + t_r * buf1[63-i].imag;
1098         b_i = t_r * buf1[63-i].real - t_i * buf1[63-i].imag;
1099
1100         c_r = t_r * buf2[i].real    + t_i * buf2[i].imag;
1101         c_i = t_i * buf2[i].real    - t_r * buf2[i].imag;
1102         d_r = t_i * buf2[63-i].real + t_r * buf2[63-i].imag;
1103         d_i = t_r * buf2[63-i].real - t_i * buf2[63-i].imag;
1104
1105         w_1 = window[2*i];
1106         w_2 = window[255-2*i];
1107         data[2*i]     = delay[2*i] * w_2 - a_r * w_1 + bias;
1108         data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
1109         delay[2*i] = c_i;
1110
1111         w_1 = window[128+2*i];
1112         w_2 = window[127-2*i];
1113         data[128+2*i] = delay[127-2*i] * w_2 + a_i * w_1 + bias;
1114         data[127-2*i] = delay[127-2*i] * w_1 - a_i * w_2 + bias;
1115         delay[127-2*i] = c_r;
1116
1117         w_1 = window[2*i+1];
1118         w_2 = window[254-2*i];
1119         data[2*i+1]   = delay[2*i+1] * w_2 - b_i * w_1 + bias;
1120         data[254-2*i] = delay[2*i+1] * w_1 + b_i * w_2 + bias;
1121         delay[2*i+1] = d_r;
1122
1123         w_1 = window[129+2*i];
1124         w_2 = window[126-2*i];
1125         data[129+2*i] = delay[126-2*i] * w_2 + b_r * w_1 + bias;
1126         data[126-2*i] = delay[126-2*i] * w_1 - b_r * w_2 + bias;
1127         delay[126-2*i] = d_i;
1128     }
1129 }
1130
/*
 * Evaluate the truncated power series
 *     sum_{k=0}^{100} x^k / (k!)^2
 * by nested (Horner-style) multiplication, starting from the innermost
 * term k = 100 and working outwards.  For x >= 0 this series is the
 * modified Bessel function I0 evaluated at 2*sqrt(x).
 */
static double besselI0 (double x)
{
    double acc = 1;
    int term;

    for (term = 100; term != 0; term--)
        acc = acc * x / (term * term) + 1;

    return acc;
}
1141
/*
 * Fill the file-level lookup tables used by the IMDCT code and choose
 * which implementation a52_imdct_512 / ifft128 / ifft64 point to.
 *
 * mm_accel: bit mask of MM_ACCEL_* flags; only the flags tested below
 *           influence the selection.
 *
 * Side effects: overwrites the tables assigned below, assigns the three
 * function pointers and prints one line on stderr naming the chosen path.
 */
void a52_imdct_init (uint32_t mm_accel)
{
    int i, j, k;
    double sum;

    /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
    /* First pass: store the running sum of the Bessel terms in the table. */
    sum = 0;
    for (i = 0; i < 256; i++) {
        sum += besselI0 (i * (256 - i) * (5 * M_PI / 256) * (5 * M_PI / 256));
        a52_imdct_window[i] = sum;
    }
    /* The normalisation divisor is the total of all 256 terms plus one. */
    sum++;
    /* Second pass: normalise each running sum and take the square root. */
    for (i = 0; i < 256; i++)
        a52_imdct_window[i] = sqrt (a52_imdct_window[i] / sum);

    /* Cosine tables: rootsN[i] = cos (2*pi*(i+1)/N) for N = 16, 32, 64, 128. */
    for (i = 0; i < 3; i++)
        roots16[i] = cos ((M_PI / 8) * (i + 1));

    for (i = 0; i < 7; i++)
        roots32[i] = cos ((M_PI / 16) * (i + 1));

    for (i = 0; i < 15; i++)
        roots64[i] = cos ((M_PI / 32) * (i + 1));

    for (i = 0; i < 31; i++)
        roots128[i] = cos ((M_PI / 64) * (i + 1));

    /*
     * pre1[]: unit-magnitude rotation factors indexed through fftorder[].
     * The second half of the table (i = 64..127) uses the same formula
     * with both components negated.
     */
    for (i = 0; i < 64; i++) {
        k = fftorder[i] / 2 + 64;
        pre1[i].real = cos ((M_PI / 256) * (k - 0.25));
        pre1[i].imag = sin ((M_PI / 256) * (k - 0.25));
    }

    for (i = 64; i < 128; i++) {
        k = fftorder[i] / 2 + 64;
        pre1[i].real = -cos ((M_PI / 256) * (k - 0.25));
        pre1[i].imag = -sin ((M_PI / 256) * (k - 0.25));
    }

    /* post1[]: rotation factors at the half-integer angles (i + 0.5) * pi/256. */
    for (i = 0; i < 64; i++) {
        post1[i].real = cos ((M_PI / 256) * (i + 0.5));
        post1[i].imag = sin ((M_PI / 256) * (i + 0.5));
    }

    /* pre2[] / post2[]: same construction with a pi/128 step; read by a52_imdct_256. */
    for (i = 0; i < 64; i++) {
        k = fftorder[i] / 4;
        pre2[i].real = cos ((M_PI / 128) * (k - 0.25));
        pre2[i].imag = sin ((M_PI / 128) * (k - 0.25));
    }

    for (i = 0; i < 32; i++) {
        post2[i].real = cos ((M_PI / 128) * (i + 0.5));
        post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
    }
    /* xcos1[] / xsin1[]: negated cosine and sine of (8*i + 1) * pi/2048. */
    for (i = 0; i < 128; i++) {
        xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
        xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
    }
    /* w[i][k] = exp (-j * pi * k / 2^i) for k = 0 .. 2^i - 1. */
    for (i = 0; i < 7; i++) {
        j = 1 << i;
        for (k = 0; k < j; k++) {
            w[i][k].real = cos (-M_PI * k / j);
            w[i][k].imag = sin (-M_PI * k / j);
        }
    }
#if ARCH_X86 || ARCH_X86_64
        /*
         * Interleaved copies of xcos1/xsin1 for the SIMD code: each source
         * value becomes a pair, {x, -x} for the cosine table and {x, x}
         * for the sine table.
         */
        for (i = 0; i < 128; i++) {
            sseSinCos1c[2*i+0]= xcos1[i];
            sseSinCos1c[2*i+1]= -xcos1[i];
            sseSinCos1d[2*i+0]= xsin1[i];
            sseSinCos1d[2*i+1]= xsin1[i];
        }
        /*
         * sseW[i]: twiddles of stage i repacked two at a time into eight
         * floats: {re0, re0, re1, re1} followed by {-im0, im0, -im1, im1}.
         */
        for (i = 1; i < 7; i++) {
            j = 1 << i;
            for (k = 0; k < j; k+=2) {

                sseW[i][4*k + 0] = w[i][k+0].real;
                sseW[i][4*k + 1] = w[i][k+0].real;
                sseW[i][4*k + 2] = w[i][k+1].real;
                sseW[i][4*k + 3] = w[i][k+1].real;

                sseW[i][4*k + 4] = -w[i][k+0].imag;
                sseW[i][4*k + 5] = w[i][k+0].imag;
                sseW[i][4*k + 6] = -w[i][k+1].imag;
                sseW[i][4*k + 7] = w[i][k+1].imag;

        //we multiply more or less uninitialized numbers so we need to use exactly 0.0
                /* k == 0: the imaginary part of w[i][0] is forced to exactly zero. */
                if(k==0)
                {
//                      sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
                        sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
                }

                /* k == j/2: the real part, cos (-pi/2), is forced to exactly zero. */
                if(2*k == j)
                {
                        sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
//                      sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
                }
            }
        }

        /* sseWindow[0..255]: the window with every even-indexed entry negated. */
        for(i=0; i<128; i++)
        {
                sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
                sseWindow[2*i+1]=  a52_imdct_window[2*i+1];
        }

        /*
         * sseWindow[256..383]: window entries 255 down to 128, first of each
         * pair negated.  sseWindow[384..511]: window entries 127 down to 0,
         * second of each pair negated.
         */
        for(i=0; i<64; i++)
        {
                sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];
                sseWindow[256 + 2*i+1]=  a52_imdct_window[254 - 2*i+0];
                sseWindow[384 + 2*i+0]=  a52_imdct_window[126 - 2*i+1];
                sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0];
        }
#endif
        /* Defaults: the plain C implementations; may be replaced below. */
        a52_imdct_512 = imdct_do_512;
        ifft128 = ifft128_c;
        ifft64 = ifft64_c;

        /*
         * Implementation selection.  The tests below form ONE if/else chain
         * that runs through several preprocessor sections: every section
         * ends in a dangling `else` that binds to whatever statement the
         * next enabled section (or the final braces) provides.  Exactly one
         * branch runs, so once a SIMD variant is chosen the djbfft test and
         * the "No accelerated" message are skipped.
         */
#if ARCH_X86 || ARCH_X86_64
        if(mm_accel & MM_ACCEL_X86_SSE)
        {
          fprintf (stderr, "Using SSE optimized IMDCT transform\n");
          a52_imdct_512 = imdct_do_512_sse;
        }
        else
        if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
        {
          fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
          a52_imdct_512 = imdct_do_512_3dnowex;
        }
        else
        if(mm_accel & MM_ACCEL_X86_3DNOW)
        {
          fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
          a52_imdct_512 = imdct_do_512_3dnow;
        }
        else
#endif // ARCH_X86 || ARCH_X86_64
#if HAVE_ALTIVEC
        if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
        {
          fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
          a52_imdct_512 = imdct_do_512_altivec;
        }
        else
#endif

#ifdef LIBA52_DJBFFT
    /* djbfft replaces only the two IFFT kernels; a52_imdct_512 is not changed here. */
    if (mm_accel & MM_ACCEL_DJBFFT) {
        fprintf (stderr, "Using djbfft for IMDCT transform\n");
        ifft128 = (void (*) (complex_t *)) fftc4_un128;
        ifft64 = (void (*) (complex_t *)) fftc4_un64;
    } else
#endif
    /* End of the chain: reached only when none of the tests above matched. */
    {
        fprintf (stderr, "No accelerated IMDCT transform found\n");
    }
}