obvious gcc warning fix, approved by Nico
[mplayer/glamo.git] / liba52 / imdct.c
blob67deab8904ea8393d320b22015593da53a7e078d
1 /*
2 * imdct.c
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
7 * See http://liba52.sourceforge.net/ for updates.
9 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
10 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
11 * $Id$
13 * a52dec is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * a52dec is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with this program; if not, write to the Free Software
25 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
28 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
29 * michael did port them from libac3 (untested, perhaps totally broken)
30 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
33 #include "config.h"
35 #include <math.h>
36 #include <stdio.h>
37 #ifndef M_PI
38 #define M_PI 3.1415926535897932384626433832795029
39 #endif
40 #include <inttypes.h>
42 #include "a52.h"
43 #include "a52_internal.h"
44 #include "mm_accel.h"
45 #include "mangle.h"
47 #ifdef RUNTIME_CPUDETECT
48 #undef HAVE_3DNOWEX
49 #endif
51 #define USE_AC3_C
/* Dispatch pointers for the 256- and 512-sample IMDCT; presumably assigned
   to the C/SSE/3DNow!/AltiVec implementation at init time (init code is not
   in this chunk — confirm against imdct_init). */
void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);

/* Minimal complex type used for the FFT work buffer and twiddle tables. */
typedef struct complex_s {
    sample_t real;
    sample_t imag;
} complex_t;

/* 128-point complex FFT, defined later in this file (USE_AC3_C path). */
static void fft_128p(complex_t *a);

/* Input permutation for fft_128p(): source index of each of the 128 complex
   inputs, used in place of plain bit-reversal on the USE_AC3_C path. */
static const int pm128[128] attribute_used __attribute__((aligned(16))) =
{
	0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
	4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
	2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
	6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
	1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
	5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
	3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
	7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
};
/* 128 point bit-reverse LUT */
/* bit_reverse_512[i] is i with its 7 low bits reversed; used to reorder the
   FFT input for the 512-sample transform on the non-USE_AC3_C path. */
static uint8_t attribute_used bit_reverse_512[] = {
	0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
	0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
	0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
	0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
	0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
	0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
	0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
	0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
	0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
	0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
	0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
	0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
	0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
	0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
	0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
	0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};

/* 64-entry bit-reverse LUT (6 low bits reversed), presumably for the
   256-sample transform — the 256-sample routines are outside this chunk. */
static uint8_t bit_reverse_256[] = {
	0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
	0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
	0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
	0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
	0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
	0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
	0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
	0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
#ifdef ARCH_X86
// NOTE: SSE needs 16byte alignment or it will segfault
/* Shared FFT work buffer plus SSE-only lookup tables.  The sse* tables are
   referenced by name (via MANGLE) from the inline asm in imdct_do_512_sse;
   they are presumably filled at init time — init code is not in this chunk. */
static complex_t __attribute__((aligned(16))) buf[128];
static float __attribute__((aligned(16))) sseSinCos1c[256];
static float __attribute__((aligned(16))) sseSinCos1d[256];
static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
//static float __attribute__((aligned(16))) sseW0[4];
/* Per-pass twiddle tables for the SSE FFT merge; pass m indexes sseW[m]. */
static float __attribute__((aligned(16))) sseW1[8];
static float __attribute__((aligned(16))) sseW2[16];
static float __attribute__((aligned(16))) sseW3[32];
static float __attribute__((aligned(16))) sseW4[64];
static float __attribute__((aligned(16))) sseW5[128];
static float __attribute__((aligned(16))) sseW6[256];
static float __attribute__((aligned(16))) *sseW[7]=
	{NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
static float __attribute__((aligned(16))) sseWindow[512];
#else
static complex_t __attribute__((aligned(16))) buf[128];
#endif

/* Twiddle factor LUT */
/* w[m] points at the 2^m twiddle factors used by FFT merge pass m. */
static complex_t __attribute__((aligned(16))) w_1[1];
static complex_t __attribute__((aligned(16))) w_2[2];
static complex_t __attribute__((aligned(16))) w_4[4];
static complex_t __attribute__((aligned(16))) w_8[8];
static complex_t __attribute__((aligned(16))) w_16[16];
static complex_t __attribute__((aligned(16))) w_32[32];
static complex_t __attribute__((aligned(16))) w_64[64];
static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};

/* Twiddle factors for IMDCT */
/* xcos1/xsin1 serve the 512-sample transform, xcos2/xsin2 the 256-sample
   one; filled elsewhere (init code not visible in this chunk). */
static sample_t __attribute__((aligned(16))) xcos1[128];
static sample_t __attribute__((aligned(16))) xsin1[128];
static sample_t __attribute__((aligned(16))) xcos2[64];
static sample_t __attribute__((aligned(16))) xsin2[64];
/* Windowing function for Modified DCT - Thank you acroread */
/* 256-entry window half.  The IMDCT routines below walk it forward for the
   overlap-add output and then backward (via *--window_ptr) for the delay
   line, i.e. the second half of the 512-point window is read by symmetry. */
sample_t imdct_window[] = {
	0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
	0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
	0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
	0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
	0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
	0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
	0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
	0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
	0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
	0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
	0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
	0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
	0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
	0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
	0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
	0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
	0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
	0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
	0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
	0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
	0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
	0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
	0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
	0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
	0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
	0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
	0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
	0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
	0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
	0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
	0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
	1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
177 static inline void swap_cmplx(complex_t *a, complex_t *b)
179 complex_t tmp;
181 tmp = *a;
182 *a = *b;
183 *b = tmp;
188 static inline complex_t cmplx_mult(complex_t a, complex_t b)
190 complex_t ret;
192 ret.real = a.real * b.real - a.imag * b.imag;
193 ret.imag = a.real * b.imag + a.imag * b.real;
195 return ret;
/*
 * imdct_do_512: 512-sample IMDCT, C reference implementation.
 *
 * data:  in, 256 frequency coefficients; out, 256 time-domain samples
 *        (windowed, overlap-added with 'delay', offset by 'bias').
 * delay: 256-sample overlap buffer from the previous block; refilled here
 *        with the windowed trailing half of the current block.
 * bias:  constant added to every output sample.
 *
 * Computed as: pre-twiddle + reorder, 128-point complex IFFT,
 * post-twiddle, then window / overlap-add.
 */
void
imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
{
    int i;
#ifndef USE_AC3_C
    /* Only the explicit FFT-merge path (#else branch below) needs these. */
    int k;
    int p,q;
    int m;
    int two_m;
    int two_m_plus_one;

    sample_t tmp_b_i;
    sample_t tmp_b_r;
#endif
    sample_t tmp_a_i;
    sample_t tmp_a_r;

    sample_t *data_ptr;
    sample_t *delay_ptr;
    sample_t *window_ptr;

    /* 512 IMDCT with source and dest data in 'data' */

    /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
    for( i=0; i < 128; i++) {
	/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
#ifdef USE_AC3_C
	/* pm128[] is the input ordering expected by fft_128p(). */
	int j= pm128[i];
#else
	/* classic bit-reversed ordering for the in-place FFT merge */
	int j= bit_reverse_512[i];
#endif
	buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
	buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
    }

    /* FFT Merge */
/* unoptimized variant
    for (m=1; m < 7; m++) {
	if(m)
	    two_m = (1 << m);
	else
	    two_m = 1;

	two_m_plus_one = (1 << (m+1));

	for(i = 0; i < 128; i += two_m_plus_one) {
	    for(k = 0; k < two_m; k++) {
		p = k + i;
		q = p + two_m;
		tmp_a_r = buf[p].real;
		tmp_a_i = buf[p].imag;
		tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
		tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
		buf[p].real = tmp_a_r + tmp_b_r;
		buf[p].imag = tmp_a_i + tmp_b_i;
		buf[q].real = tmp_a_r - tmp_b_r;
		buf[q].imag = tmp_a_i - tmp_b_i;
	    }
	}
    }
*/
#ifdef USE_AC3_C
    /* dedicated 128-point FFT, defined later in this file */
    fft_128p (&buf[0]);
#else

    /* 1. iteration */
    /* Pass 0 butterflies: twiddle is {1,0}, so it reduces to add/sub. */
    for(i = 0; i < 128; i += 2) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+1].real;
	tmp_b_i = buf[i+1].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+1].real = tmp_a_r - tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
    }

    /* 2. iteration */
    // Note w[1]={{1,0}, {0,-1}}
    for(i = 0; i < 128; i += 4) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+2].real;
	tmp_b_i = buf[i+2].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+2].real = tmp_a_r - tmp_b_r;
	buf[i+2].imag = tmp_a_i - tmp_b_i;
	/* re/im swap below implements the multiply by {0,-1} */
	tmp_a_r = buf[i+1].real;
	tmp_a_i = buf[i+1].imag;
	tmp_b_r = buf[i+3].imag;
	tmp_b_i = buf[i+3].real;
	buf[i+1].real = tmp_a_r + tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
	buf[i+3].real = tmp_a_r - tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
    }

    /* 3. iteration */
    /* Twiddles are 1, (1-j)/sqrt(2), -j, -(1+j)/sqrt(2): the sqrt(2)/2
       factors come from w[2][1].real and w[2][3].imag. */
    for(i = 0; i < 128; i += 8) {
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+4].real;
	tmp_b_i = buf[i+4].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+4].real = tmp_a_r - tmp_b_r;
	buf[i+4].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[1+i].real;
	tmp_a_i = buf[1+i].imag;
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[1+i].real = tmp_a_r + tmp_b_r;
	buf[1+i].imag = tmp_a_i + tmp_b_i;
	buf[i+5].real = tmp_a_r - tmp_b_r;
	buf[i+5].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+2].real;
	tmp_a_i = buf[i+2].imag;
	tmp_b_r = buf[i+6].imag;
	tmp_b_i = - buf[i+6].real;
	buf[i+2].real = tmp_a_r + tmp_b_r;
	buf[i+2].imag = tmp_a_i + tmp_b_i;
	buf[i+6].real = tmp_a_r - tmp_b_r;
	buf[i+6].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+3].real;
	tmp_a_i = buf[i+3].imag;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+3].real = tmp_a_r + tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
	buf[i+7].real = tmp_a_r - tmp_b_r;
	buf[i+7].imag = tmp_a_i - tmp_b_i;
    }

    /* 4-7. iterations */
    /* Generic radix-2 passes using the w[m] twiddle tables. */
    for (m=3; m < 7; m++) {
	two_m = (1 << m);

	two_m_plus_one = two_m<<1;

	for(i = 0; i < 128; i += two_m_plus_one) {
	    for(k = 0; k < two_m; k++) {
		int p = k + i;
		int q = p + two_m;
		tmp_a_r = buf[p].real;
		tmp_a_i = buf[p].imag;
		tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
		tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
		buf[p].real = tmp_a_r + tmp_b_r;
		buf[p].imag = tmp_a_i + tmp_b_i;
		buf[q].real = tmp_a_r - tmp_b_r;
		buf[q].imag = tmp_a_i - tmp_b_i;
	    }
	}
    }
#endif
    /* Post IFFT complex multiply plus IFFT complex conjugate*/
    for( i=0; i < 128; i++) {
	/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
	tmp_a_r = buf[i].real;
	tmp_a_i = -1.0 * buf[i].imag;
	buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
	buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
    }

    data_ptr = data;
    delay_ptr = delay;
    window_ptr = imdct_window;

    /* Window and convert to real valued signal */
    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
    }

    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
    }

    /* The trailing edge of the window goes into the delay line */
    /* window_ptr now sits one past imdct_window[255]; the loops below walk
       it backwards, reading the window mirrored. */
    delay_ptr = delay;

    for(i=0; i< 64; i++) {
	*delay_ptr++ = -buf[64+i].real * *--window_ptr;
	*delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
    }

    for(i=0; i<64; i++) {
	*delay_ptr++ = buf[i].imag * *--window_ptr;
	*delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
    }
}
#ifdef HAVE_ALTIVEC

#ifndef SYS_DARWIN
#include <altivec.h>
#endif

// used to build registers permutation vectors (vcprm)
// the 's' are for words in the _s_econd vector
#define WORD_0 0x00,0x01,0x02,0x03
#define WORD_1 0x04,0x05,0x06,0x07
#define WORD_2 0x08,0x09,0x0a,0x0b
#define WORD_3 0x0c,0x0d,0x0e,0x0f
#define WORD_s0 0x10,0x11,0x12,0x13
#define WORD_s1 0x14,0x15,0x16,0x17
#define WORD_s2 0x18,0x19,0x1a,0x1b
#define WORD_s3 0x1c,0x1d,0x1e,0x1f

/* Darwin's compiler spells vector literals with parentheses, gcc with
   braces — hence the two variants of each constructor macro below. */
#ifdef SYS_DARWIN
#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
#else
#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
#endif

// vcprmle is used to keep the same index as in the SSE version.
// it's the same as vcprm, with the index inversed
// ('le' is Little Endian)
#define vcprmle(a,b,c,d) vcprm(d,c,b,a)

// used to build inverse/identity vectors (vcii)
// n is _n_egative, p is _p_ositive
#define FLOAT_n -1.
#define FLOAT_p 1.

#ifdef SYS_DARWIN
#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
#else
#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
#endif

/* FOUROF(a): broadcast scalar a to all four lanes of a vector literal. */
#ifdef SYS_DARWIN
#define FOUROF(a) (a)
#else
#define FOUROF(a) {a,a,a,a}
#endif
/*
 * imdct_do_512_altivec: AltiVec-accelerated 512-sample IMDCT.
 * Same contract as imdct_do_512().  The "#if 0" blocks keep the scalar
 * reference code for each butterfly that the vector code replaces.
 * Byte offsets: i << 3 is i * sizeof(complex_t) (two floats per element);
 * i << 2 is i * sizeof(float) — NOTE(review): assumes 4-byte sample_t.
 */
void
imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
{
    int i;
    int k;
    int p,q;
    int m;
    int two_m;
    int two_m_plus_one;

    sample_t tmp_b_i;
    sample_t tmp_b_r;
    sample_t tmp_a_i;
    sample_t tmp_a_r;

    sample_t *data_ptr;
    sample_t *delay_ptr;
    sample_t *window_ptr;

    /* 512 IMDCT with source and dest data in 'data' */

    /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
    for( i=0; i < 128; i++) {
	/* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
	int j= bit_reverse_512[i];
	buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
	buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
    }

    /* 1. iteration */
    for(i = 0; i < 128; i += 2) {
#if 0
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+1].real;
	tmp_b_i = buf[i+1].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+1].real = tmp_a_r - tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
#else
	/* one vector load covers buf[i] and buf[i+1] */
	vector float temp, bufv;

	bufv = vec_ld(i << 3, (float*)buf);
	temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
	bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
	vec_st(bufv, i << 3, (float*)buf);
#endif
    }

    /* 2. iteration */
    // Note w[1]={{1,0}, {0,-1}}
    for(i = 0; i < 128; i += 4) {
#if 0
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+2].real;
	tmp_b_i = buf[i+2].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+2].real = tmp_a_r - tmp_b_r;
	buf[i+2].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+1].real;
	tmp_a_i = buf[i+1].imag;
	/* WARNING: im <-> re here ! */
	tmp_b_r = buf[i+3].imag;
	tmp_b_i = buf[i+3].real;
	buf[i+1].real = tmp_a_r + tmp_b_r;
	buf[i+1].imag = tmp_a_i - tmp_b_i;
	buf[i+3].real = tmp_a_r - tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
#else
	vector float buf01, buf23, temp1, temp2;

	buf01 = vec_ld((i + 0) << 3, (float*)buf);
	buf23 = vec_ld((i + 2) << 3, (float*)buf);
	/* swap re/im of buf[i+3] to realize the {0,-1} twiddle */
	buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));

	temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
	temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);

	vec_st(temp1, (i + 0) << 3, (float*)buf);
	vec_st(temp2, (i + 2) << 3, (float*)buf);
#endif
    }

    /* 3. iteration */
    for(i = 0; i < 128; i += 8) {
#if 0
	tmp_a_r = buf[i].real;
	tmp_a_i = buf[i].imag;
	tmp_b_r = buf[i+4].real;
	tmp_b_i = buf[i+4].imag;
	buf[i].real = tmp_a_r + tmp_b_r;
	buf[i].imag = tmp_a_i + tmp_b_i;
	buf[i+4].real = tmp_a_r - tmp_b_r;
	buf[i+4].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[1+i].real;
	tmp_a_i = buf[1+i].imag;
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[1+i].real = tmp_a_r + tmp_b_r;
	buf[1+i].imag = tmp_a_i + tmp_b_i;
	buf[i+5].real = tmp_a_r - tmp_b_r;
	buf[i+5].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+2].real;
	tmp_a_i = buf[i+2].imag;
	/* WARNING re <-> im & sign */
	tmp_b_r = buf[i+6].imag;
	tmp_b_i = - buf[i+6].real;
	buf[i+2].real = tmp_a_r + tmp_b_r;
	buf[i+2].imag = tmp_a_i + tmp_b_i;
	buf[i+6].real = tmp_a_r - tmp_b_r;
	buf[i+6].imag = tmp_a_i - tmp_b_i;
	tmp_a_r = buf[i+3].real;
	tmp_a_i = buf[i+3].imag;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+3].real = tmp_a_r + tmp_b_r;
	buf[i+3].imag = tmp_a_i + tmp_b_i;
	buf[i+7].real = tmp_a_r - tmp_b_r;
	buf[i+7].imag = tmp_a_i - tmp_b_i;
#else
	vector float buf01, buf23, buf45, buf67;

	buf01 = vec_ld((i + 0) << 3, (float*)buf);
	buf23 = vec_ld((i + 2) << 3, (float*)buf);

	/* buf[i+5]/buf[i+7] are twiddled in scalar code first, then the
	   butterflies are done with whole vectors below. */
	tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
	tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
	buf[i+5].real = tmp_b_r;
	buf[i+5].imag = tmp_b_i;
	tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
	tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
	buf[i+7].real = tmp_b_r;
	buf[i+7].imag = tmp_b_i;

	/* reload buf23 after the scalar stores above */
	buf23 = vec_ld((i + 2) << 3, (float*)buf);
	buf45 = vec_ld((i + 4) << 3, (float*)buf);
	buf67 = vec_ld((i + 6) << 3, (float*)buf);
	buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));

	vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
	vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
	vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
	vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
#endif
    }

    /* 4-7. iterations */
    for (m=3; m < 7; m++) {
	two_m = (1 << m);

	two_m_plus_one = two_m<<1;

	/* two butterflies (k and k+1) per vector iteration */
	for(i = 0; i < 128; i += two_m_plus_one) {
	    for(k = 0; k < two_m; k+=2) {
#if 0
		int p = k + i;
		int q = p + two_m;
		tmp_a_r = buf[p].real;
		tmp_a_i = buf[p].imag;
		tmp_b_r =
		    buf[q].real * w[m][k].real -
		    buf[q].imag * w[m][k].imag;
		tmp_b_i =
		    buf[q].imag * w[m][k].real +
		    buf[q].real * w[m][k].imag;
		buf[p].real = tmp_a_r + tmp_b_r;
		buf[p].imag = tmp_a_i + tmp_b_i;
		buf[q].real = tmp_a_r - tmp_b_r;
		buf[q].imag = tmp_a_i - tmp_b_i;

		tmp_a_r = buf[(p + 1)].real;
		tmp_a_i = buf[(p + 1)].imag;
		tmp_b_r =
		    buf[(q + 1)].real * w[m][(k + 1)].real -
		    buf[(q + 1)].imag * w[m][(k + 1)].imag;
		tmp_b_i =
		    buf[(q + 1)].imag * w[m][(k + 1)].real +
		    buf[(q + 1)].real * w[m][(k + 1)].imag;
		buf[(p + 1)].real = tmp_a_r + tmp_b_r;
		buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
		buf[(q + 1)].real = tmp_a_r - tmp_b_r;
		buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
#else
		int p = k + i;
		int q = p + two_m;
		vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
		const vector float vczero = (const vector float)FOUROF(0.);
		// first compute buf[q] and buf[q+1]
		vecq = vec_ld(q << 3, (float*)buf);
		vecw = vec_ld(0, (float*)&(w[m][k]));
		temp1 = vec_madd(vecq, vecw, vczero);
		temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
		temp2 = vec_madd(temp2, vecw, vczero);
		temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
		temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
		vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
		// then butterfly with buf[p] and buf[p+1]
		vecp = vec_ld(p << 3, (float*)buf);

		temp1 = vec_add(vecp, vecq);
		temp2 = vec_sub(vecp, vecq);

		vec_st(temp1, p << 3, (float*)buf);
		vec_st(temp2, q << 3, (float*)buf);
#endif
	    }
	}
    }

    /* Post IFFT complex multiply plus IFFT complex conjugate*/
    for( i=0; i < 128; i+=4) {
	/* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
#if 0
	tmp_a_r = buf[(i + 0)].real;
	tmp_a_i = -1.0 * buf[(i + 0)].imag;
	buf[(i + 0)].real =
	    (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
	buf[(i + 0)].imag =
	    (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);

	tmp_a_r = buf[(i + 1)].real;
	tmp_a_i = -1.0 * buf[(i + 1)].imag;
	buf[(i + 1)].real =
	    (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
	buf[(i + 1)].imag =
	    (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);

	tmp_a_r = buf[(i + 2)].real;
	tmp_a_i = -1.0 * buf[(i + 2)].imag;
	buf[(i + 2)].real =
	    (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
	buf[(i + 2)].imag =
	    (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);

	tmp_a_r = buf[(i + 3)].real;
	tmp_a_i = -1.0 * buf[(i + 3)].imag;
	buf[(i + 3)].real =
	    (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
	buf[(i + 3)].imag =
	    (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
#else
	vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
	vector float temp0022, temp1133, tempCS01;
	const vector float vczero = (const vector float)FOUROF(0.);

	bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
	bufv_2 = vec_ld((i + 2) << 3, (float*)buf);

	cosv = vec_ld(i << 2, xcos1);
	sinv = vec_ld(i << 2, xsin1);

	temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
	temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
	tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
	temp1 = vec_madd(temp0022, tempCS01, vczero);
	tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
	temp2 = vec_madd(temp1133, tempCS01, vczero);
	bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);

	vec_st(bufv_0, (i + 0) << 3, (float*)buf);

	/* idem with bufv_2 and high-order cosv/sinv */

	temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
	temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
	tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
	temp1 = vec_madd(temp0022, tempCS01, vczero);
	tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
	temp2 = vec_madd(temp1133, tempCS01, vczero);
	bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);

	vec_st(bufv_2, (i + 2) << 3, (float*)buf);
#endif
    }

    data_ptr = data;
    delay_ptr = delay;
    window_ptr = imdct_window;

    /* Window and convert to real valued signal */
    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
    }

    for(i=0; i< 64; i++) {
	*data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
	*data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
    }

    /* The trailing edge of the window goes into the delay line */
    delay_ptr = delay;

    for(i=0; i< 64; i++) {
	*delay_ptr++ = -buf[64+i].real * *--window_ptr;
	*delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
    }

    for(i=0; i<64; i++) {
	*delay_ptr++ = buf[i].imag * *--window_ptr;
	*delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
    }
}
745 #endif
748 // Stuff below this line is borrowed from libac3
749 #include "srfftp.h"
750 #ifdef ARCH_X86
751 #ifndef HAVE_3DNOW
752 #define HAVE_3DNOW 1
753 #endif
754 #include "srfftp_3dnow.h"
/* Constants for the 3DNow! code in srfftp_3dnow.h / imdct_3dnow.h.
   The two masks presumably serve as per-lane sign flips (0x80000000 is the
   IEEE-754 float sign bit) — usage lives in the included headers, confirm
   there.  HSQRT2 is sqrt(2)/2 in both lanes. */
const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };

/* imdct_3dnow.h is deliberately included twice: once with HAVE_3DNOWEX
   undefined and once defined, instantiating both the plain 3DNow! and the
   3DNow!Ex variants of the routines it declares. */
#undef HAVE_3DNOWEX
#include "imdct_3dnow.h"
#define HAVE_3DNOWEX
#include "imdct_3dnow.h"
765 void
766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
768 /* int i,k;
769 int p,q;*/
770 int m;
771 int two_m;
772 int two_m_plus_one;
774 /* sample_t tmp_a_i;
775 sample_t tmp_a_r;
776 sample_t tmp_b_i;
777 sample_t tmp_b_r;*/
779 sample_t *data_ptr;
780 sample_t *delay_ptr;
781 sample_t *window_ptr;
783 /* 512 IMDCT with source and dest data in 'data' */
784 /* see the c version (dct_do_512()), its allmost identical, just in C */
786 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
787 /* Bit reversed shuffling */
788 asm volatile(
789 "xorl %%esi, %%esi \n\t"
790 "leal "MANGLE(bit_reverse_512)", %%eax \n\t"
791 "movl $1008, %%edi \n\t"
792 "pushl %%ebp \n\t" //use ebp without telling gcc
793 ".balign 16 \n\t"
794 "1: \n\t"
795 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI
796 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI
797 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi
798 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi
799 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
800 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
801 "mulps %%xmm0, %%xmm2 \n\t"
802 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
803 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
804 "subps %%xmm0, %%xmm2 \n\t"
805 "movzbl (%%eax), %%edx \n\t"
806 "movzbl 1(%%eax), %%ebp \n\t"
807 "movlps %%xmm2, (%1, %%edx,8) \n\t"
808 "movhps %%xmm2, (%1, %%ebp,8) \n\t"
809 "addl $16, %%esi \n\t"
810 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap
811 "subl $16, %%edi \n\t"
812 " jnc 1b \n\t"
813 "popl %%ebp \n\t"//no we didnt touch ebp *g*
814 :: "b" (data), "c" (buf)
815 : "%esi", "%edi", "%eax", "%edx"
819 /* FFT Merge */
820 /* unoptimized variant
821 for (m=1; m < 7; m++) {
822 if(m)
823 two_m = (1 << m);
824 else
825 two_m = 1;
827 two_m_plus_one = (1 << (m+1));
829 for(i = 0; i < 128; i += two_m_plus_one) {
830 for(k = 0; k < two_m; k++) {
831 p = k + i;
832 q = p + two_m;
833 tmp_a_r = buf[p].real;
834 tmp_a_i = buf[p].imag;
835 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
836 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
837 buf[p].real = tmp_a_r + tmp_b_r;
838 buf[p].imag = tmp_a_i + tmp_b_i;
839 buf[q].real = tmp_a_r - tmp_b_r;
840 buf[q].imag = tmp_a_i - tmp_b_i;
846 /* 1. iteration */
847 // Note w[0][0]={1,0}
848 asm volatile(
849 "xorps %%xmm1, %%xmm1 \n\t"
850 "xorps %%xmm2, %%xmm2 \n\t"
851 "movl %0, %%esi \n\t"
852 ".balign 16 \n\t"
853 "1: \n\t"
854 "movlps (%%esi), %%xmm0 \n\t" //buf[p]
855 "movlps 8(%%esi), %%xmm1\n\t" //buf[q]
856 "movhps (%%esi), %%xmm0 \n\t" //buf[p]
857 "movhps 8(%%esi), %%xmm2\n\t" //buf[q]
858 "addps %%xmm1, %%xmm0 \n\t"
859 "subps %%xmm2, %%xmm0 \n\t"
860 "movaps %%xmm0, (%%esi) \n\t"
861 "addl $16, %%esi \n\t"
862 "cmpl %1, %%esi \n\t"
863 " jb 1b \n\t"
864 :: "g" (buf), "r" (buf + 128)
865 : "%esi"
868 /* 2. iteration */
869 // Note w[1]={{1,0}, {0,-1}}
870 asm volatile(
871 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
872 "movl %0, %%esi \n\t"
873 ".balign 16 \n\t"
874 "1: \n\t"
875 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3
876 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
877 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
878 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
879 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1
880 "addps %%xmm2, %%xmm0 \n\t"
881 "subps %%xmm2, %%xmm1 \n\t"
882 "movaps %%xmm0, (%%esi) \n\t"
883 "movaps %%xmm1, 16(%%esi) \n\t"
884 "addl $32, %%esi \n\t"
885 "cmpl %1, %%esi \n\t"
886 " jb 1b \n\t"
887 :: "g" (buf), "r" (buf + 128)
888 : "%esi"
891 /* 3. iteration */
893 Note sseW2+0={1,1,sqrt(2),sqrt(2))
894 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
895 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
896 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
898 asm volatile(
899 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
900 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
901 "xorps %%xmm5, %%xmm5 \n\t"
902 "xorps %%xmm2, %%xmm2 \n\t"
903 "movl %0, %%esi \n\t"
904 ".balign 16 \n\t"
905 "1: \n\t"
906 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5
907 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7
908 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
909 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
910 "mulps %%xmm2, %%xmm4 \n\t"
911 "mulps %%xmm3, %%xmm5 \n\t"
912 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
913 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
914 "mulps %%xmm6, %%xmm3 \n\t"
915 "mulps %%xmm7, %%xmm2 \n\t"
916 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
917 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3
918 "addps %%xmm4, %%xmm2 \n\t"
919 "addps %%xmm5, %%xmm3 \n\t"
920 "movaps %%xmm2, %%xmm4 \n\t"
921 "movaps %%xmm3, %%xmm5 \n\t"
922 "addps %%xmm0, %%xmm2 \n\t"
923 "addps %%xmm1, %%xmm3 \n\t"
924 "subps %%xmm4, %%xmm0 \n\t"
925 "subps %%xmm5, %%xmm1 \n\t"
926 "movaps %%xmm2, (%%esi) \n\t"
927 "movaps %%xmm3, 16(%%esi) \n\t"
928 "movaps %%xmm0, 32(%%esi) \n\t"
929 "movaps %%xmm1, 48(%%esi) \n\t"
930 "addl $64, %%esi \n\t"
931 "cmpl %1, %%esi \n\t"
932 " jb 1b \n\t"
933 :: "g" (buf), "r" (buf + 128)
934 : "%esi"
937 /* 4-7. iterations */
938 for (m=3; m < 7; m++) {
939 two_m = (1 << m);
940 two_m_plus_one = two_m<<1;
941 asm volatile(
942 "movl %0, %%esi \n\t"
943 ".balign 16 \n\t"
944 "1: \n\t"
945 "xorl %%edi, %%edi \n\t" // k
946 "leal (%%esi, %3), %%edx \n\t"
947 "2: \n\t"
948 "movaps (%%edx, %%edi), %%xmm1 \n\t"
949 "movaps (%4, %%edi, 2), %%xmm2 \n\t"
950 "mulps %%xmm1, %%xmm2 \n\t"
951 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
952 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t"
953 "movaps (%%esi, %%edi), %%xmm0 \n\t"
954 "addps %%xmm2, %%xmm1 \n\t"
955 "movaps %%xmm1, %%xmm2 \n\t"
956 "addps %%xmm0, %%xmm1 \n\t"
957 "subps %%xmm2, %%xmm0 \n\t"
958 "movaps %%xmm1, (%%esi, %%edi) \n\t"
959 "movaps %%xmm0, (%%edx, %%edi) \n\t"
960 "addl $16, %%edi \n\t"
961 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0
962 " jb 2b \n\t"
963 "addl %2, %%esi \n\t"
964 "cmpl %1, %%esi \n\t"
965 " jb 1b \n\t"
966 :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3),
967 "r" (sseW[m])
968 : "%esi", "%edi", "%edx"
972 /* Post IFFT complex multiply plus IFFT complex conjugate*/
973 asm volatile(
974 "movl $-1024, %%esi \n\t"
975 ".balign 16 \n\t"
976 "1: \n\t"
977 "movaps (%0, %%esi), %%xmm0 \n\t"
978 "movaps (%0, %%esi), %%xmm1 \n\t"
979 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
980 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t"
981 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
982 "addps %%xmm1, %%xmm0 \n\t"
983 "movaps %%xmm0, (%0, %%esi) \n\t"
984 "addl $16, %%esi \n\t"
985 " jnz 1b \n\t"
986 :: "r" (buf+128)
987 : "%esi"
991 data_ptr = data;
992 delay_ptr = delay;
993 window_ptr = imdct_window;
995 /* Window and convert to real valued signal */
996 asm volatile(
997 "xorl %%edi, %%edi \n\t" // 0
998 "xorl %%esi, %%esi \n\t" // 0
999 "movss %3, %%xmm2 \n\t" // bias
1000 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1001 ".balign 16 \n\t"
1002 "1: \n\t"
1003 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
1004 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
1005 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
1006 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
1007 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1008 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1009 "addps (%2, %%esi), %%xmm0 \n\t"
1010 "addps %%xmm2, %%xmm0 \n\t"
1011 "movaps %%xmm0, (%1, %%esi) \n\t"
1012 "addl $16, %%esi \n\t"
1013 "subl $16, %%edi \n\t"
1014 "cmpl $512, %%esi \n\t"
1015 " jb 1b \n\t"
1016 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1017 : "%esi", "%edi"
1019 data_ptr+=128;
1020 delay_ptr+=128;
1021 // window_ptr+=128;
1023 asm volatile(
1024 "movl $1024, %%edi \n\t" // 512
1025 "xorl %%esi, %%esi \n\t" // 0
1026 "movss %3, %%xmm2 \n\t" // bias
1027 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1028 ".balign 16 \n\t"
1029 "1: \n\t"
1030 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
1031 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
1032 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
1033 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
1034 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1035 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1036 "addps (%2, %%esi), %%xmm0 \n\t"
1037 "addps %%xmm2, %%xmm0 \n\t"
1038 "movaps %%xmm0, (%1, %%esi) \n\t"
1039 "addl $16, %%esi \n\t"
1040 "subl $16, %%edi \n\t"
1041 "cmpl $512, %%esi \n\t"
1042 " jb 1b \n\t"
1043 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1044 : "%esi", "%edi"
1046 data_ptr+=128;
1047 // window_ptr+=128;
1049 /* The trailing edge of the window goes into the delay line */
1050 delay_ptr = delay;
1052 asm volatile(
1053 "xorl %%edi, %%edi \n\t" // 0
1054 "xorl %%esi, %%esi \n\t" // 0
1055 ".balign 16 \n\t"
1056 "1: \n\t"
1057 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
1058 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
1059 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
1060 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
1061 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1062 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1063 "movaps %%xmm0, (%1, %%esi) \n\t"
1064 "addl $16, %%esi \n\t"
1065 "subl $16, %%edi \n\t"
1066 "cmpl $512, %%esi \n\t"
1067 " jb 1b \n\t"
1068 :: "r" (buf+64), "r" (delay_ptr)
1069 : "%esi", "%edi"
1071 delay_ptr+=128;
1072 // window_ptr-=128;
1074 asm volatile(
1075 "movl $1024, %%edi \n\t" // 1024
1076 "xorl %%esi, %%esi \n\t" // 0
1077 ".balign 16 \n\t"
1078 "1: \n\t"
1079 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
1080 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
1081 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
1082 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
1083 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1084 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1085 "movaps %%xmm0, (%1, %%esi) \n\t"
1086 "addl $16, %%esi \n\t"
1087 "subl $16, %%edi \n\t"
1088 "cmpl $512, %%esi \n\t"
1089 " jb 1b \n\t"
1090 :: "r" (buf), "r" (delay_ptr)
1091 : "%esi", "%edi"
1094 #endif //arch_x86
1096 void
1097 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
1099 int i,k;
1100 int p,q;
1101 int m;
1102 int two_m;
1103 int two_m_plus_one;
1105 sample_t tmp_a_i;
1106 sample_t tmp_a_r;
1107 sample_t tmp_b_i;
1108 sample_t tmp_b_r;
1110 sample_t *data_ptr;
1111 sample_t *delay_ptr;
1112 sample_t *window_ptr;
1114 complex_t *buf_1, *buf_2;
1116 buf_1 = &buf[0];
1117 buf_2 = &buf[64];
1119 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1120 for(k=0; k<64; k++) {
1121 /* X1[k] = X[2*k] */
1122 /* X2[k] = X[2*k+1] */
1124 p = 2 * (128-2*k-1);
1125 q = 2 * (2 * k);
1127 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
1128 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
1129 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
1130 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
1131 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
1132 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
1135 /* IFFT Bit reversed shuffling */
1136 for(i=0; i<64; i++) {
1137 k = bit_reverse_256[i];
1138 if (k < i) {
1139 swap_cmplx(&buf_1[i],&buf_1[k]);
1140 swap_cmplx(&buf_2[i],&buf_2[k]);
1144 /* FFT Merge */
1145 for (m=0; m < 6; m++) {
1146 two_m = (1 << m);
1147 two_m_plus_one = (1 << (m+1));
1149 /* FIXME */
1150 if(m)
1151 two_m = (1 << m);
1152 else
1153 two_m = 1;
1155 for(k = 0; k < two_m; k++) {
1156 for(i = 0; i < 64; i += two_m_plus_one) {
1157 p = k + i;
1158 q = p + two_m;
1159 /* Do block 1 */
1160 tmp_a_r = buf_1[p].real;
1161 tmp_a_i = buf_1[p].imag;
1162 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
1163 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
1164 buf_1[p].real = tmp_a_r + tmp_b_r;
1165 buf_1[p].imag = tmp_a_i + tmp_b_i;
1166 buf_1[q].real = tmp_a_r - tmp_b_r;
1167 buf_1[q].imag = tmp_a_i - tmp_b_i;
1169 /* Do block 2 */
1170 tmp_a_r = buf_2[p].real;
1171 tmp_a_i = buf_2[p].imag;
1172 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
1173 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
1174 buf_2[p].real = tmp_a_r + tmp_b_r;
1175 buf_2[p].imag = tmp_a_i + tmp_b_i;
1176 buf_2[q].real = tmp_a_r - tmp_b_r;
1177 buf_2[q].imag = tmp_a_i - tmp_b_i;
1182 /* Post IFFT complex multiply */
1183 for( i=0; i < 64; i++) {
1184 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1185 tmp_a_r = buf_1[i].real;
1186 tmp_a_i = -buf_1[i].imag;
1187 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1188 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1189 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
1190 tmp_a_r = buf_2[i].real;
1191 tmp_a_i = -buf_2[i].imag;
1192 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1193 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1196 data_ptr = data;
1197 delay_ptr = delay;
1198 window_ptr = imdct_window;
1200 /* Window and convert to real valued signal */
1201 for(i=0; i< 64; i++) {
1202 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
1203 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
1206 for(i=0; i< 64; i++) {
1207 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
1208 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
1211 delay_ptr = delay;
1213 for(i=0; i< 64; i++) {
1214 *delay_ptr++ = -buf_2[i].real * *--window_ptr;
1215 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
1218 for(i=0; i< 64; i++) {
1219 *delay_ptr++ = buf_2[i].imag * *--window_ptr;
1220 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
/*
 * One-time initialization: build the twiddle-factor and window tables
 * and select the fastest available IMDCT implementation for the
 * capabilities reported in mm_accel.
 */
void imdct_init (uint32_t mm_accel)
{
#ifdef LIBA52_MLIB
    /* mediaLib, when compiled in and enabled, takes priority */
    if (mm_accel & MM_ACCEL_MLIB) {
        fprintf (stderr, "Using mlib for IMDCT transform\n");
        imdct_512 = imdct_do_512_mlib;
        imdct_256 = imdct_do_256_mlib;
    } else
#endif
    {
        int i, j, k;

        /* Twiddle factors to turn IFFT into IMDCT */
        for (i = 0; i < 128; i++) {
            xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
            xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
        }
#ifdef ARCH_X86
        /* pairwise duplicated / sign-flipped twiddles in SSE vector layout */
        for (i = 0; i < 128; i++) {
            sseSinCos1c[2*i+0]= xcos1[i];
            sseSinCos1c[2*i+1]= -xcos1[i];
            sseSinCos1d[2*i+0]= xsin1[i];
            sseSinCos1d[2*i+1]= xsin1[i];
        }
#endif

        /* More twiddle factors to turn IFFT into IMDCT */
        for (i = 0; i < 64; i++) {
            xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
            xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
        }

        /* FFT butterfly twiddles: w[i][k] = e^{-j*pi*k/2^i} */
        for (i = 0; i < 7; i++) {
            j = 1 << i;
            for (k = 0; k < j; k++) {
                w[i][k].real = cos (-M_PI * k / j);
                w[i][k].imag = sin (-M_PI * k / j);
            }
        }
#ifdef ARCH_X86
        /* SSE twiddle layout: two reals duplicated, then the matching
         * imaginaries with alternating sign, per pair of k values */
        for (i = 1; i < 7; i++) {
            j = 1 << i;
            for (k = 0; k < j; k+=2) {
                sseW[i][4*k + 0] = w[i][k+0].real;
                sseW[i][4*k + 1] = w[i][k+0].real;
                sseW[i][4*k + 2] = w[i][k+1].real;
                sseW[i][4*k + 3] = w[i][k+1].real;

                sseW[i][4*k + 4] = -w[i][k+0].imag;
                sseW[i][4*k + 5] = w[i][k+0].imag;
                sseW[i][4*k + 6] = -w[i][k+1].imag;
                sseW[i][4*k + 7] = w[i][k+1].imag;

                //we multiply more or less uninitalized numbers so we need to use exactly 0.0
                if(k==0)
                {
//                  sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
                    sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
                }

                if(2*k == j)
                {
                    sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
//                  sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
                }
            }
        }

        /* pre-negated window copies matching the SSE kernels' sign usage */
        for(i=0; i<128; i++)
        {
            sseWindow[2*i+0]= -imdct_window[2*i+0];
            sseWindow[2*i+1]= imdct_window[2*i+1];
        }

        for(i=0; i<64; i++)
        {
            sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
            sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
            sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
            sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
        }
#endif // arch_x86

        /* default implementation; overridden below if an accelerated
         * variant is compiled in and enabled at runtime */
        imdct_512 = imdct_do_512;
#ifdef ARCH_X86
        if(mm_accel & MM_ACCEL_X86_SSE)
        {
            fprintf (stderr, "Using SSE optimized IMDCT transform\n");
            imdct_512 = imdct_do_512_sse;
        }
        else
        if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
        {
            fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
            imdct_512 = imdct_do_512_3dnowex;
        }
        else
        if(mm_accel & MM_ACCEL_X86_3DNOW)
        {
            fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
            imdct_512 = imdct_do_512_3dnow;
        }
        else
#endif // arch_x86
#ifdef HAVE_ALTIVEC
        if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
        {
            fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
            imdct_512 = imdct_do_512_altivec;
        }
        else
#endif
        fprintf (stderr, "No accelerated IMDCT transform found\n");
        /* no accelerated 256-point variant exists on these paths */
        imdct_256 = imdct_do_256;
    }
}
/*
 * Merge step of the split-radix FFT: combine four k-point sub-blocks
 * starting at x, x+2k, x+4k, x+6k into one larger transform.
 * wTB/wB point at the already-computed sub-results; d and d_3 are the
 * twiddle tables for w^1 and w^3.
 * NOTE(review): TRANSZERO/TRANS are macros defined elsewhere in this
 * file; they presumably use the register scratch floats declared
 * below -- verify against the macro definitions.
 */
static void fft_asmb(int k, complex_t *x, complex_t *wTB,
                     const complex_t *d, const complex_t *d_3)
{
    register complex_t *x2k, *x3k, *x4k, *wB;
    register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;

    x2k = x + 2 * k;
    x3k = x2k + 2 * k;
    x4k = x3k + 2 * k;
    wB = wTB + 2 * k;

    /* indices 0 and 1 have trivial/fixed twiddles and are special-cased */
    TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
    TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);

    /* remaining butterflies, two per iteration; all eight pointers are
     * bumped together so offsets 2 and 3 stay valid each pass */
    --k;
    for(;;) {
        TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
        TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
        if (!--k) break;
        x += 2;
        x2k += 2;
        x3k += 2;
        x4k += 2;
        d += 2;
        d_3 += 2;
        wTB += 2;
        wB += 2;
    }
}
1373 static void fft_asmb16(complex_t *x, complex_t *wTB)
1375 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
1376 int k = 2;
1378 /* transform x[0], x[8], x[4], x[12] */
1379 TRANSZERO(x[0],x[4],x[8],x[12]);
1381 /* transform x[1], x[9], x[5], x[13] */
1382 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
1384 /* transform x[2], x[10], x[6], x[14] */
1385 TRANSHALF_16(x[2],x[6],x[10],x[14]);
1387 /* transform x[3], x[11], x[7], x[15] */
1388 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
1392 static void fft_4(complex_t *x)
1394 /* delta_p = 1 here */
1395 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
1398 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
1400 yt_r = x[0].real;
1401 yb_r = yt_r - x[2].real;
1402 yt_r += x[2].real;
1404 u_r = x[1].real;
1405 vi_i = x[3].real - u_r;
1406 u_r += x[3].real;
1408 u_i = x[1].imag;
1409 vi_r = u_i - x[3].imag;
1410 u_i += x[3].imag;
1412 yt_i = yt_r;
1413 yt_i += u_r;
1414 x[0].real = yt_i;
1415 yt_r -= u_r;
1416 x[2].real = yt_r;
1417 yt_i = yb_r;
1418 yt_i += vi_r;
1419 x[1].real = yt_i;
1420 yb_r -= vi_r;
1421 x[3].real = yb_r;
1423 yt_i = x[0].imag;
1424 yb_i = yt_i - x[2].imag;
1425 yt_i += x[2].imag;
1427 yt_r = yt_i;
1428 yt_r += u_i;
1429 x[0].imag = yt_r;
1430 yt_i -= u_i;
1431 x[2].imag = yt_i;
1432 yt_r = yb_i;
1433 yt_r += vi_i;
1434 x[1].imag = yt_r;
1435 yb_i -= vi_i;
1436 x[3].imag = yb_i;
/*
 * In-place 8-point FFT built on fft_4 plus hand-scheduled odd-index
 * butterflies.  Temporaries are aggressively reused, so statement
 * order is significant throughout.
 */
static void fft_8(complex_t *x)
{
    /* delta_p = diag{1, sqrt(i)} here */
    /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
     */
    register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;

    /* save the odd inputs x[1], x[3] before the even half is compacted */
    wT1_r = x[1].real;
    wT1_i = x[1].imag;
    wB1_r = x[3].real;
    wB1_i = x[3].imag;

    /* gather the even inputs into x[0..3] and transform them */
    x[1] = x[2];
    x[2] = x[4];
    x[3] = x[6];
    fft_4(&x[0]);

    /* x[0] x[4] */
    wT2_r = x[5].real;
    wT2_r += x[7].real;
    wT2_r += wT1_r;
    wT2_r += wB1_r;
    wT2_i = wT2_r;
    wT2_r += x[0].real;
    wT2_i = x[0].real - wT2_i;
    x[0].real = wT2_r;
    x[4].real = wT2_i;

    wT2_i = x[5].imag;
    wT2_i += x[7].imag;
    wT2_i += wT1_i;
    wT2_i += wB1_i;
    wT2_r = wT2_i;
    wT2_r += x[0].imag;
    wT2_i = x[0].imag - wT2_i;
    x[0].imag = wT2_r;
    x[4].imag = wT2_i;

    /* x[2] x[6] */
    wT2_r = x[5].imag;
    wT2_r -= x[7].imag;
    wT2_r += wT1_i;
    wT2_r -= wB1_i;
    wT2_i = wT2_r;
    wT2_r += x[2].real;
    wT2_i = x[2].real - wT2_i;
    x[2].real = wT2_r;
    x[6].real = wT2_i;

    wT2_i = x[5].real;
    wT2_i -= x[7].real;
    wT2_i += wT1_r;
    wT2_i -= wB1_r;
    wT2_r = wT2_i;
    wT2_r += x[2].imag;
    wT2_i = x[2].imag - wT2_i;
    x[2].imag = wT2_i;
    x[6].imag = wT2_r;

    /* x[1] x[5] */
    wT2_r = wT1_r;
    wT2_r += wB1_i;
    wT2_r -= x[5].real;
    wT2_r -= x[7].imag;
    wT2_i = wT1_i;
    wT2_i -= wB1_r;
    wT2_i -= x[5].imag;
    wT2_i += x[7].real;

    /* rotate by sqrt(i) (scale by HSQRT2 after sum/difference) */
    wB2_r = wT2_r;
    wB2_r += wT2_i;
    wT2_i -= wT2_r;
    wB2_r *= HSQRT2;
    wT2_i *= HSQRT2;
    wT2_r = wB2_r;
    wB2_r += x[1].real;
    wT2_r = x[1].real - wT2_r;

    /* wB2_i saves x[5].real before it is overwritten; the x[3]/x[7]
     * section below still needs the old value */
    wB2_i = x[5].real;
    x[1].real = wB2_r;
    x[5].real = wT2_r;

    wT2_r = wT2_i;
    wT2_r += x[1].imag;
    wT2_i = x[1].imag - wT2_i;
    /* likewise wB2_r now saves the old x[5].imag */
    wB2_r = x[5].imag;
    x[1].imag = wT2_r;
    x[5].imag = wT2_i;

    /* x[3] x[7] */
    wT1_r -= wB1_i;
    wT1_i += wB1_r;
    wB1_r = wB2_i - x[7].imag;
    wB1_i = wB2_r + x[7].real;
    wT1_r -= wB1_r;
    wT1_i -= wB1_i;
    wB1_r = wT1_r + wT1_i;
    wB1_r *= HSQRT2;
    wT1_i -= wT1_r;
    wT1_i *= HSQRT2;
    wB2_r = x[3].real;
    wB2_i = wB2_r + wT1_i;
    wB2_r -= wT1_i;
    x[3].real = wB2_i;
    x[7].real = wB2_r;
    wB2_i = x[3].imag;
    wB2_r = wB2_i + wB1_r;
    wB2_i -= wB1_r;
    x[3].imag = wB2_i;
    x[7].imag = wB2_r;
}
/*
 * Full in-place 128-point complex FFT over a[0..127], built bottom-up
 * from 4- and 8-point kernels combined by fft_asmb16() (16-point merge)
 * and fft_asmb() (general split-radix merge).
 */
static void fft_128p(complex_t *a)
{
    /* 16-point transform on a[0..15] */
    fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
    fft_asmb16(&a[0], &a[8]);

    /* extend to 32 points: a[0..31] */
    fft_8(&a[16]), fft_8(&a[24]);
    fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);

    /* 16-point transform on a[32..47] */
    fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
    fft_asmb16(&a[32], &a[40]);

    /* 16-point transform on a[48..63] */
    fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
    fft_asmb16(&a[48], &a[56]);

    /* merge to 64 points: a[0..63] */
    fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);

    fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);

    /* fft_16(&a[64]); */
    fft_asmb16(&a[64], &a[72]);

    fft_8(&a[80]); fft_8(&a[88]);

    /* fft_32(&a[64]); */
    fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);

    fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);

    /* fft_16(&a[96]); */
    fft_asmb16(&a[96], &a[104]);

    fft_8(&a[112]), fft_8(&a[120]);

    /* fft_32(&a[96]); */
    fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);

    /* fft_128(&a[0]); */
    fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
}