liba52/imdct.c

   1 /*
   2  * imdct.c
   3  * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
   4  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
   5  *
   6  * This file is part of a52dec, a free ATSC A-52 stream decoder.
   7  * See http://liba52.sourceforge.net/ for updates.
   8  *
   9  * Modified for use with MPlayer, changes contained in liba52_changes.diff.
  10  * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
  11  * $Id$
  12  *
  13  * a52dec is free software; you can redistribute it and/or modify
  14  * it under the terms of the GNU General Public License as published by
  15  * the Free Software Foundation; either version 2 of the License, or
  16  * (at your option) any later version.
  17  *
  18  * a52dec is distributed in the hope that it will be useful,
  19  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  20  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21  * GNU General Public License for more details.
  22  *
  23  * You should have received a copy of the GNU General Public License
  24  * along with this program; if not, write to the Free Software
  25  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  26  *
  27  * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
  28  * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
  29  *   michael did port them from libac3 (untested, perhaps totally broken)
  30  * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
  31  */
  32
  33 #include "config.h"
  34
  35 #include <math.h>
  36 #include <stdio.h>
  37 #ifndef M_PI
  38 #define M_PI 3.1415926535897932384626433832795029
  39 #endif
  40 #include <inttypes.h>
  41
  42 #include "a52.h"
  43 #include "a52_internal.h"
  44 #include "mm_accel.h"
  45 #include "mangle.h"
  46
  47 #ifdef RUNTIME_CPUDETECT
  48 #undef HAVE_3DNOWEX
  49 #endif
  50
  51 #define USE_AC3_C
  52
  53 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
  54 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
  55
  56 typedef struct complex_s {
  57     sample_t real;
  58     sample_t imag;
  59 } complex_t;
  60
  61 static void fft_128p(complex_t *a);
  62
  63 static const int pm128[128] attribute_used __attribute__((aligned(16))) =
  64 {
  65         0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
  66         4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
  67         2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
  68         6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
  69         1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
  70         5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
  71         3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
  72         7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
  73 };
  74
  75 /* 128 point bit-reverse LUT */
  76 static uint8_t attribute_used bit_reverse_512[] = {
  77         0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
  78         0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
  79         0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
  80         0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
  81         0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
  82         0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
  83         0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
  84         0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
  85         0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
  86         0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
  87         0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
  88         0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
  89         0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
  90         0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
  91         0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
  92         0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
  93
  94 static uint8_t bit_reverse_256[] = {
  95         0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
  96         0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
  97         0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
  98         0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
  99         0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
 100         0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
 101         0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
 102         0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
 103
 104 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 105 // NOTE: SSE needs 16byte alignment or it will segfault
 106 //
 107 static complex_t __attribute__((aligned(16))) buf[128];
 108 static float __attribute__((aligned(16))) sseSinCos1c[256];
 109 static float __attribute__((aligned(16))) sseSinCos1d[256];
 110 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
 111 //static float __attribute__((aligned(16))) sseW0[4];
 112 static float __attribute__((aligned(16))) sseW1[8];
 113 static float __attribute__((aligned(16))) sseW2[16];
 114 static float __attribute__((aligned(16))) sseW3[32];
 115 static float __attribute__((aligned(16))) sseW4[64];
 116 static float __attribute__((aligned(16))) sseW5[128];
 117 static float __attribute__((aligned(16))) sseW6[256];
 118 static float __attribute__((aligned(16))) *sseW[7]=
 119         {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
 120 static float __attribute__((aligned(16))) sseWindow[512];
 121 #else
 122 static complex_t  __attribute__((aligned(16))) buf[128];
 123 #endif
 124
 125 /* Twiddle factor LUT */
 126 static complex_t __attribute__((aligned(16))) w_1[1];
 127 static complex_t __attribute__((aligned(16))) w_2[2];
 128 static complex_t __attribute__((aligned(16))) w_4[4];
 129 static complex_t __attribute__((aligned(16))) w_8[8];
 130 static complex_t __attribute__((aligned(16))) w_16[16];
 131 static complex_t __attribute__((aligned(16))) w_32[32];
 132 static complex_t __attribute__((aligned(16))) w_64[64];
 133 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
 134
 135 /* Twiddle factors for IMDCT */
 136 static sample_t __attribute__((aligned(16))) xcos1[128];
 137 static sample_t __attribute__((aligned(16))) xsin1[128];
 138 static sample_t __attribute__((aligned(16))) xcos2[64];
 139 static sample_t __attribute__((aligned(16))) xsin2[64];
 140
 141 /* Windowing function for Modified DCT - Thank you acroread */
 142 sample_t imdct_window[] = {
 143         0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
 144         0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
 145         0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
 146         0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
 147         0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
 148         0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
 149         0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
 150         0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
 151         0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
 152         0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
 153         0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
 154         0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
 155         0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
 156         0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
 157         0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
 158         0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
 159         0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
 160         0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
 161         0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
 162         0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
 163         0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
 164         0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
 165         0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
 166         0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
 167         0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
 168         0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
 169         0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
 170         0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
 171         0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
 172         0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
 173         0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
 174         1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
 175
 176
 177 static inline void swap_cmplx(complex_t *a, complex_t *b)
 178 {
 179     complex_t tmp;
 180
 181     tmp = *a;
 182     *a = *b;
 183     *b = tmp;
 184 }
 185
 186
 187
 188 static inline complex_t cmplx_mult(complex_t a, complex_t b)
 189 {
 190     complex_t ret;
 191
 192     ret.real = a.real * b.real - a.imag * b.imag;
 193     ret.imag = a.real * b.imag + a.imag * b.real;
 194
 195     return ret;
 196 }
 197
 198 void
 199 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
 200 {
 201     int i;
 202 #ifndef USE_AC3_C
 203         int k;
 204     int p,q;
 205     int m;
 206     int two_m;
 207     int two_m_plus_one;
 208
 209     sample_t tmp_b_i;
 210     sample_t tmp_b_r;
 211 #endif
 212     sample_t tmp_a_i;
 213     sample_t tmp_a_r;
 214
 215     sample_t *data_ptr;
 216     sample_t *delay_ptr;
 217     sample_t *window_ptr;
 218
 219     /* 512 IMDCT with source and dest data in 'data' */
 220
 221     /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
 222     for( i=0; i < 128; i++) {
 223         /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
 224 #ifdef USE_AC3_C
 225         int j= pm128[i];
 226 #else
 227         int j= bit_reverse_512[i];
 228 #endif
 229         buf[i].real =         (data[256-2*j-1] * xcos1[j])  -  (data[2*j]       * xsin1[j]);
 230         buf[i].imag = -1.0 * ((data[2*j]       * xcos1[j])  +  (data[256-2*j-1] * xsin1[j]));
 231     }
 232
 233     /* FFT Merge */
 234 /* unoptimized variant
 235     for (m=1; m < 7; m++) {
 236         if(m)
 237             two_m = (1 << m);
 238         else
 239             two_m = 1;
 240
 241         two_m_plus_one = (1 << (m+1));
 242
 243         for(i = 0; i < 128; i += two_m_plus_one) {
 244             for(k = 0; k < two_m; k++) {
 245                 p = k + i;
 246                 q = p + two_m;
 247                 tmp_a_r = buf[p].real;
 248                 tmp_a_i = buf[p].imag;
 249                 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
 250                 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
 251                 buf[p].real = tmp_a_r + tmp_b_r;
 252                 buf[p].imag =  tmp_a_i + tmp_b_i;
 253                 buf[q].real = tmp_a_r - tmp_b_r;
 254                 buf[q].imag =  tmp_a_i - tmp_b_i;
 255             }
 256         }
 257     }
 258 */
 259 #ifdef USE_AC3_C
 260         fft_128p (&buf[0]);
 261 #else
 262
 263     /* 1. iteration */
 264     for(i = 0; i < 128; i += 2) {
 265         tmp_a_r = buf[i].real;
 266         tmp_a_i = buf[i].imag;
 267         tmp_b_r = buf[i+1].real;
 268         tmp_b_i = buf[i+1].imag;
 269         buf[i].real = tmp_a_r + tmp_b_r;
 270         buf[i].imag =  tmp_a_i + tmp_b_i;
 271         buf[i+1].real = tmp_a_r - tmp_b_r;
 272         buf[i+1].imag =  tmp_a_i - tmp_b_i;
 273     }
 274
 275     /* 2. iteration */
 276         // Note w[1]={{1,0}, {0,-1}}
 277     for(i = 0; i < 128; i += 4) {
 278         tmp_a_r = buf[i].real;
 279         tmp_a_i = buf[i].imag;
 280         tmp_b_r = buf[i+2].real;
 281         tmp_b_i = buf[i+2].imag;
 282         buf[i].real = tmp_a_r + tmp_b_r;
 283         buf[i].imag =  tmp_a_i + tmp_b_i;
 284         buf[i+2].real = tmp_a_r - tmp_b_r;
 285         buf[i+2].imag =  tmp_a_i - tmp_b_i;
 286         tmp_a_r = buf[i+1].real;
 287         tmp_a_i = buf[i+1].imag;
 288         tmp_b_r = buf[i+3].imag;
 289         tmp_b_i = buf[i+3].real;
 290         buf[i+1].real = tmp_a_r + tmp_b_r;
 291         buf[i+1].imag =  tmp_a_i - tmp_b_i;
 292         buf[i+3].real = tmp_a_r - tmp_b_r;
 293         buf[i+3].imag =  tmp_a_i + tmp_b_i;
 294     }
 295
 296     /* 3. iteration */
 297     for(i = 0; i < 128; i += 8) {
 298                 tmp_a_r = buf[i].real;
 299                 tmp_a_i = buf[i].imag;
 300                 tmp_b_r = buf[i+4].real;
 301                 tmp_b_i = buf[i+4].imag;
 302                 buf[i].real = tmp_a_r + tmp_b_r;
 303                 buf[i].imag =  tmp_a_i + tmp_b_i;
 304                 buf[i+4].real = tmp_a_r - tmp_b_r;
 305                 buf[i+4].imag =  tmp_a_i - tmp_b_i;
 306                 tmp_a_r = buf[1+i].real;
 307                 tmp_a_i = buf[1+i].imag;
 308                 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
 309                 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
 310                 buf[1+i].real = tmp_a_r + tmp_b_r;
 311                 buf[1+i].imag =  tmp_a_i + tmp_b_i;
 312                 buf[i+5].real = tmp_a_r - tmp_b_r;
 313                 buf[i+5].imag =  tmp_a_i - tmp_b_i;
 314                 tmp_a_r = buf[i+2].real;
 315                 tmp_a_i = buf[i+2].imag;
 316                 tmp_b_r = buf[i+6].imag;
 317                 tmp_b_i = - buf[i+6].real;
 318                 buf[i+2].real = tmp_a_r + tmp_b_r;
 319                 buf[i+2].imag =  tmp_a_i + tmp_b_i;
 320                 buf[i+6].real = tmp_a_r - tmp_b_r;
 321                 buf[i+6].imag =  tmp_a_i - tmp_b_i;
 322                 tmp_a_r = buf[i+3].real;
 323                 tmp_a_i = buf[i+3].imag;
 324                 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
 325                 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
 326                 buf[i+3].real = tmp_a_r + tmp_b_r;
 327                 buf[i+3].imag =  tmp_a_i + tmp_b_i;
 328                 buf[i+7].real = tmp_a_r - tmp_b_r;
 329                 buf[i+7].imag =  tmp_a_i - tmp_b_i;
 330      }
 331
 332     /* 4-7. iterations */
 333     for (m=3; m < 7; m++) {
 334         two_m = (1 << m);
 335
 336         two_m_plus_one = two_m<<1;
 337
 338         for(i = 0; i < 128; i += two_m_plus_one) {
 339             for(k = 0; k < two_m; k++) {
 340                 int p = k + i;
 341                 int q = p + two_m;
 342                 tmp_a_r = buf[p].real;
 343                 tmp_a_i = buf[p].imag;
 344                 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
 345                 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
 346                 buf[p].real = tmp_a_r + tmp_b_r;
 347                 buf[p].imag =  tmp_a_i + tmp_b_i;
 348                 buf[q].real = tmp_a_r - tmp_b_r;
 349                 buf[q].imag =  tmp_a_i - tmp_b_i;
 350             }
 351         }
 352     }
 353 #endif
 354     /* Post IFFT complex multiply  plus IFFT complex conjugate*/
 355     for( i=0; i < 128; i++) {
 356         /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
 357         tmp_a_r =        buf[i].real;
 358         tmp_a_i = -1.0 * buf[i].imag;
 359         buf[i].real =(tmp_a_r * xcos1[i])  -  (tmp_a_i  * xsin1[i]);
 360         buf[i].imag =(tmp_a_r * xsin1[i])  +  (tmp_a_i  * xcos1[i]);
 361     }
 362
 363     data_ptr = data;
 364     delay_ptr = delay;
 365     window_ptr = imdct_window;
 366
 367     /* Window and convert to real valued signal */
 368     for(i=0; i< 64; i++) {
 369         *data_ptr++   = -buf[64+i].imag   * *window_ptr++ + *delay_ptr++ + bias;
 370         *data_ptr++   =  buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
 371     }
 372
 373     for(i=0; i< 64; i++) {
 374         *data_ptr++  = -buf[i].real       * *window_ptr++ + *delay_ptr++ + bias;
 375         *data_ptr++  =  buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
 376     }
 377
 378     /* The trailing edge of the window goes into the delay line */
 379     delay_ptr = delay;
 380
 381     for(i=0; i< 64; i++) {
 382         *delay_ptr++  = -buf[64+i].real   * *--window_ptr;
 383         *delay_ptr++  =  buf[64-i-1].imag * *--window_ptr;
 384     }
 385
 386     for(i=0; i<64; i++) {
 387         *delay_ptr++  =  buf[i].imag       * *--window_ptr;
 388         *delay_ptr++  = -buf[128-i-1].real * *--window_ptr;
 389     }
 390 }
 391
 392 #ifdef HAVE_ALTIVEC
 393
 394 #ifndef SYS_DARWIN
 395 #include <altivec.h>
 396 #endif
 397
 398 // used to build registers permutation vectors (vcprm)
 399 // the 's' are for words in the _s_econd vector
 400 #define WORD_0 0x00,0x01,0x02,0x03
 401 #define WORD_1 0x04,0x05,0x06,0x07
 402 #define WORD_2 0x08,0x09,0x0a,0x0b
 403 #define WORD_3 0x0c,0x0d,0x0e,0x0f
 404 #define WORD_s0 0x10,0x11,0x12,0x13
 405 #define WORD_s1 0x14,0x15,0x16,0x17
 406 #define WORD_s2 0x18,0x19,0x1a,0x1b
 407 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
 408
 409 #ifdef SYS_DARWIN
 410 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
 411 #else
 412 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
 413 #endif
 414
 415 // vcprmle is used to keep the same index as in the SSE version.
 416 // it's the same as vcprm, with the index inversed
 417 // ('le' is Little Endian)
 418 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
 419
 420 // used to build inverse/identity vectors (vcii)
 421 // n is _n_egative, p is _p_ositive
 422 #define FLOAT_n -1.
 423 #define FLOAT_p 1.
 424
 425 #ifdef SYS_DARWIN
 426 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
 427 #else
 428 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
 429 #endif
 430
 431 #ifdef SYS_DARWIN
 432 #define FOUROF(a) (a)
 433 #else
 434 #define FOUROF(a) {a,a,a,a}
 435 #endif
 436
 437
 438 void
 439 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
 440 {
 441   int i;
 442   int k;
 443   int p,q;
 444   int m;
 445   long two_m;
 446   long two_m_plus_one;
 447
 448   sample_t tmp_b_i;
 449   sample_t tmp_b_r;
 450   sample_t tmp_a_i;
 451   sample_t tmp_a_r;
 452
 453   sample_t *data_ptr;
 454   sample_t *delay_ptr;
 455   sample_t *window_ptr;
 456
 457   /* 512 IMDCT with source and dest data in 'data' */
 458
 459   /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
 460   for( i=0; i < 128; i++) {
 461     /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
 462     int j= bit_reverse_512[i];
 463     buf[i].real =         (data[256-2*j-1] * xcos1[j])  -  (data[2*j]       * xsin1[j]);
 464     buf[i].imag = -1.0 * ((data[2*j]       * xcos1[j])  +  (data[256-2*j-1] * xsin1[j]));
 465   }
 466
 467   /* 1. iteration */
 468   for(i = 0; i < 128; i += 2) {
 469 #if 0
 470     tmp_a_r = buf[i].real;
 471     tmp_a_i = buf[i].imag;
 472     tmp_b_r = buf[i+1].real;
 473     tmp_b_i = buf[i+1].imag;
 474     buf[i].real = tmp_a_r + tmp_b_r;
 475     buf[i].imag =  tmp_a_i + tmp_b_i;
 476     buf[i+1].real = tmp_a_r - tmp_b_r;
 477     buf[i+1].imag =  tmp_a_i - tmp_b_i;
 478 #else
 479     vector float temp, bufv;
 480
 481     bufv = vec_ld(i << 3, (float*)buf);
 482     temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
 483     bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
 484     vec_st(bufv, i << 3, (float*)buf);
 485 #endif
 486   }
 487
 488   /* 2. iteration */
 489   // Note w[1]={{1,0}, {0,-1}}
 490   for(i = 0; i < 128; i += 4) {
 491 #if 0
 492     tmp_a_r = buf[i].real;
 493     tmp_a_i = buf[i].imag;
 494     tmp_b_r = buf[i+2].real;
 495     tmp_b_i = buf[i+2].imag;
 496     buf[i].real = tmp_a_r + tmp_b_r;
 497     buf[i].imag =  tmp_a_i + tmp_b_i;
 498     buf[i+2].real = tmp_a_r - tmp_b_r;
 499     buf[i+2].imag =  tmp_a_i - tmp_b_i;
 500     tmp_a_r = buf[i+1].real;
 501     tmp_a_i = buf[i+1].imag;
 502     /* WARNING: im <-> re here ! */
 503     tmp_b_r = buf[i+3].imag;
 504     tmp_b_i = buf[i+3].real;
 505     buf[i+1].real = tmp_a_r + tmp_b_r;
 506     buf[i+1].imag =  tmp_a_i - tmp_b_i;
 507     buf[i+3].real = tmp_a_r - tmp_b_r;
 508     buf[i+3].imag =  tmp_a_i + tmp_b_i;
 509 #else
 510     vector float buf01, buf23, temp1, temp2;
 511
 512     buf01 = vec_ld((i + 0) << 3, (float*)buf);
 513     buf23 = vec_ld((i + 2) << 3, (float*)buf);
 514     buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
 515
 516     temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
 517     temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
 518
 519     vec_st(temp1, (i + 0) << 3, (float*)buf);
 520     vec_st(temp2, (i + 2) << 3, (float*)buf);
 521 #endif
 522   }
 523
 524   /* 3. iteration */
 525   for(i = 0; i < 128; i += 8) {
 526 #if 0
 527     tmp_a_r = buf[i].real;
 528     tmp_a_i = buf[i].imag;
 529     tmp_b_r = buf[i+4].real;
 530     tmp_b_i = buf[i+4].imag;
 531     buf[i].real = tmp_a_r + tmp_b_r;
 532     buf[i].imag =  tmp_a_i + tmp_b_i;
 533     buf[i+4].real = tmp_a_r - tmp_b_r;
 534     buf[i+4].imag =  tmp_a_i - tmp_b_i;
 535     tmp_a_r = buf[1+i].real;
 536     tmp_a_i = buf[1+i].imag;
 537     tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
 538     tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
 539     buf[1+i].real = tmp_a_r + tmp_b_r;
 540     buf[1+i].imag =  tmp_a_i + tmp_b_i;
 541     buf[i+5].real = tmp_a_r - tmp_b_r;
 542     buf[i+5].imag =  tmp_a_i - tmp_b_i;
 543     tmp_a_r = buf[i+2].real;
 544     tmp_a_i = buf[i+2].imag;
 545     /* WARNING re <-> im & sign */
 546     tmp_b_r = buf[i+6].imag;
 547     tmp_b_i = - buf[i+6].real;
 548     buf[i+2].real = tmp_a_r + tmp_b_r;
 549     buf[i+2].imag =  tmp_a_i + tmp_b_i;
 550     buf[i+6].real = tmp_a_r - tmp_b_r;
 551     buf[i+6].imag =  tmp_a_i - tmp_b_i;
 552     tmp_a_r = buf[i+3].real;
 553     tmp_a_i = buf[i+3].imag;
 554     tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
 555     tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
 556     buf[i+3].real = tmp_a_r + tmp_b_r;
 557     buf[i+3].imag =  tmp_a_i + tmp_b_i;
 558     buf[i+7].real = tmp_a_r - tmp_b_r;
 559     buf[i+7].imag =  tmp_a_i - tmp_b_i;
 560 #else
 561     vector float buf01, buf23, buf45, buf67;
 562
 563     buf01 = vec_ld((i + 0) << 3, (float*)buf);
 564     buf23 = vec_ld((i + 2) << 3, (float*)buf);
 565
 566     tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
 567     tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
 568     buf[i+5].real = tmp_b_r;
 569     buf[i+5].imag = tmp_b_i;
 570     tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
 571     tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
 572     buf[i+7].real = tmp_b_r;
 573     buf[i+7].imag = tmp_b_i;
 574
 575     buf23 = vec_ld((i + 2) << 3, (float*)buf);
 576     buf45 = vec_ld((i + 4) << 3, (float*)buf);
 577     buf67 = vec_ld((i + 6) << 3, (float*)buf);
 578     buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
 579
 580     vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
 581     vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
 582     vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
 583     vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
 584 #endif
 585   }
 586
 587   /* 4-7. iterations */
 588   for (m=3; m < 7; m++) {
 589     two_m = (1 << m);
 590
 591     two_m_plus_one = two_m<<1;
 592
 593     for(i = 0; i < 128; i += two_m_plus_one) {
 594       for(k = 0; k < two_m; k+=2) {
 595 #if 0
 596         int p = k + i;
 597         int q = p + two_m;
 598         tmp_a_r = buf[p].real;
 599         tmp_a_i = buf[p].imag;
 600         tmp_b_r =
 601           buf[q].real * w[m][k].real -
 602           buf[q].imag * w[m][k].imag;
 603         tmp_b_i =
 604           buf[q].imag * w[m][k].real +
 605           buf[q].real * w[m][k].imag;
 606         buf[p].real = tmp_a_r + tmp_b_r;
 607         buf[p].imag =  tmp_a_i + tmp_b_i;
 608         buf[q].real = tmp_a_r - tmp_b_r;
 609         buf[q].imag =  tmp_a_i - tmp_b_i;
 610
 611         tmp_a_r = buf[(p + 1)].real;
 612         tmp_a_i = buf[(p + 1)].imag;
 613         tmp_b_r =
 614           buf[(q + 1)].real * w[m][(k + 1)].real -
 615           buf[(q + 1)].imag * w[m][(k + 1)].imag;
 616         tmp_b_i =
 617           buf[(q + 1)].imag * w[m][(k + 1)].real +
 618           buf[(q + 1)].real * w[m][(k + 1)].imag;
 619         buf[(p + 1)].real = tmp_a_r + tmp_b_r;
 620         buf[(p + 1)].imag =  tmp_a_i + tmp_b_i;
 621         buf[(q + 1)].real = tmp_a_r - tmp_b_r;
 622         buf[(q + 1)].imag =  tmp_a_i - tmp_b_i;
 623 #else
 624         int p = k + i;
 625         int q = p + two_m;
 626         vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
 627         const vector float vczero = (const vector float)FOUROF(0.);
 628         // first compute buf[q] and buf[q+1]
 629         vecq = vec_ld(q << 3, (float*)buf);
 630         vecw = vec_ld(0, (float*)&(w[m][k]));
 631         temp1 = vec_madd(vecq, vecw, vczero);
 632         temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
 633         temp2 = vec_madd(temp2, vecw, vczero);
 634         temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
 635         temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
 636         vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
 637         // then butterfly with buf[p] and buf[p+1]
 638         vecp = vec_ld(p << 3, (float*)buf);
 639
 640         temp1 = vec_add(vecp, vecq);
 641         temp2 = vec_sub(vecp, vecq);
 642
 643         vec_st(temp1, p << 3, (float*)buf);
 644         vec_st(temp2, q << 3, (float*)buf);
 645 #endif
 646       }
 647     }
 648   }
 649
 650   /* Post IFFT complex multiply  plus IFFT complex conjugate*/
 651   for( i=0; i < 128; i+=4) {
 652     /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
 653 #if 0
 654     tmp_a_r =        buf[(i + 0)].real;
 655     tmp_a_i = -1.0 * buf[(i + 0)].imag;
 656     buf[(i + 0)].real =
 657       (tmp_a_r * xcos1[(i + 0)])  -  (tmp_a_i  * xsin1[(i + 0)]);
 658     buf[(i + 0)].imag =
 659       (tmp_a_r * xsin1[(i + 0)])  +  (tmp_a_i  * xcos1[(i + 0)]);
 660
 661     tmp_a_r =        buf[(i + 1)].real;
 662     tmp_a_i = -1.0 * buf[(i + 1)].imag;
 663     buf[(i + 1)].real =
 664       (tmp_a_r * xcos1[(i + 1)])  -  (tmp_a_i  * xsin1[(i + 1)]);
 665     buf[(i + 1)].imag =
 666       (tmp_a_r * xsin1[(i + 1)])  +  (tmp_a_i  * xcos1[(i + 1)]);
 667
 668     tmp_a_r =        buf[(i + 2)].real;
 669     tmp_a_i = -1.0 * buf[(i + 2)].imag;
 670     buf[(i + 2)].real =
 671       (tmp_a_r * xcos1[(i + 2)])  -  (tmp_a_i  * xsin1[(i + 2)]);
 672     buf[(i + 2)].imag =
 673       (tmp_a_r * xsin1[(i + 2)])  +  (tmp_a_i  * xcos1[(i + 2)]);
 674
 675     tmp_a_r =        buf[(i + 3)].real;
 676     tmp_a_i = -1.0 * buf[(i + 3)].imag;
 677     buf[(i + 3)].real =
 678       (tmp_a_r * xcos1[(i + 3)])  -  (tmp_a_i  * xsin1[(i + 3)]);
 679     buf[(i + 3)].imag =
 680       (tmp_a_r * xsin1[(i + 3)])  +  (tmp_a_i  * xcos1[(i + 3)]);
 681 #else
 682     vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
 683     vector float temp0022, temp1133, tempCS01;
 684     const vector float vczero = (const vector float)FOUROF(0.);
 685
 686     bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
 687     bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
 688
 689     cosv = vec_ld(i << 2, xcos1);
 690     sinv = vec_ld(i << 2, xsin1);
 691
 692     temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
 693     temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
 694     tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
 695     temp1 = vec_madd(temp0022, tempCS01, vczero);
 696     tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
 697     temp2 = vec_madd(temp1133, tempCS01, vczero);
 698     bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
 699
 700     vec_st(bufv_0, (i + 0) << 3, (float*)buf);
 701
 702     /* idem with bufv_2 and high-order cosv/sinv */
 703
 704     temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
 705     temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
 706     tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
 707     temp1 = vec_madd(temp0022, tempCS01, vczero);
 708     tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
 709     temp2 = vec_madd(temp1133, tempCS01, vczero);
 710     bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
 711
 712     vec_st(bufv_2, (i + 2) << 3, (float*)buf);
 713
 714 #endif
 715   }
 716
 717   data_ptr = data;
 718   delay_ptr = delay;
 719   window_ptr = imdct_window;
 720
 721   /* Window and convert to real valued signal */
 722   for(i=0; i< 64; i++) {
 723     *data_ptr++   = -buf[64+i].imag   * *window_ptr++ + *delay_ptr++ + bias;
 724     *data_ptr++   =  buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
 725   }
 726
 727   for(i=0; i< 64; i++) {
 728     *data_ptr++  = -buf[i].real       * *window_ptr++ + *delay_ptr++ + bias;
 729     *data_ptr++  =  buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
 730   }
 731
 732   /* The trailing edge of the window goes into the delay line */
 733   delay_ptr = delay;
 734
 735   for(i=0; i< 64; i++) {
 736     *delay_ptr++  = -buf[64+i].real   * *--window_ptr;
 737     *delay_ptr++  =  buf[64-i-1].imag * *--window_ptr;
 738   }
 739
 740   for(i=0; i<64; i++) {
 741     *delay_ptr++  =  buf[i].imag       * *--window_ptr;
 742     *delay_ptr++  = -buf[128-i-1].real * *--window_ptr;
 743   }
 744 }
 745 #endif
 746
 747
 748 // Stuff below this line is borrowed from libac3
 749 #include "srfftp.h"
 750 #if defined(ARCH_X86) || defined(ARCH_X86_64)
 751 #ifndef HAVE_3DNOW
 752 #define HAVE_3DNOW 1
 753 #endif
 754 #include "srfftp_3dnow.h"
 755
 756 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
 757 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
 758 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
 759
 760 #undef HAVE_3DNOWEX
 761 #include "imdct_3dnow.h"
 762 #define HAVE_3DNOWEX
 763 #include "imdct_3dnow.h"
 764
 765 void
 766 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
 767 {
 768 /*      int i,k;
 769     int p,q;*/
 770     int m;
 771     long two_m;
 772     long two_m_plus_one;
 773     long two_m_plus_one_shl3;
 774     complex_t *buf_offset;
 775
 776 /*  sample_t tmp_a_i;
 777     sample_t tmp_a_r;
 778     sample_t tmp_b_i;
 779     sample_t tmp_b_r;*/
 780
 781     sample_t *data_ptr;
 782     sample_t *delay_ptr;
 783     sample_t *window_ptr;
 784
 785     /* 512 IMDCT with source and dest data in 'data' */
 786     /* see the c version (dct_do_512()), its allmost identical, just in C */
 787
 788     /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
 789     /* Bit reversed shuffling */
 790         asm volatile(
 791                 "xor %%"REG_S", %%"REG_S"               \n\t"
 792                 "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
 793                 "mov $1008, %%"REG_D"                   \n\t"
 794                 "push %%"REG_BP"                        \n\t" //use ebp without telling gcc
 795                 ".balign 16                             \n\t"
 796                 "1:                                     \n\t"
 797                 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
 798                 "movhps 8(%0, %%"REG_D"), %%xmm0        \n\t" // RXXI
 799                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // XXXi
 800                 "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
 801                 "shufps $0x33, %%xmm1, %%xmm0           \n\t" // irIR
 802                 "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
 803                 "mulps %%xmm0, %%xmm2                   \n\t"
 804                 "shufps $0xB1, %%xmm0, %%xmm0           \n\t" // riRI
 805                 "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
 806                 "subps %%xmm0, %%xmm2                   \n\t"
 807                 "movzb (%%"REG_a"), %%"REG_d"           \n\t"
 808                 "movzb 1(%%"REG_a"), %%"REG_BP"         \n\t"
 809                 "movlps %%xmm2, (%1, %%"REG_d", 8)      \n\t"
 810                 "movhps %%xmm2, (%1, %%"REG_BP", 8)     \n\t"
 811                 "add $16, %%"REG_S"                     \n\t"
 812                 "add $2, %%"REG_a"                      \n\t" // avoid complex addressing for P4 crap
 813                 "sub $16, %%"REG_D"                     \n\t"
 814                 "jnc 1b                                 \n\t"
 815                 "pop %%"REG_BP"                         \n\t"//no we didnt touch ebp *g*
 816                 :: "r" (data), "r" (buf)
 817                 : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
 818         );
 819
 820
 821     /* FFT Merge */
 822 /* unoptimized variant
 823     for (m=1; m < 7; m++) {
 824         if(m)
 825             two_m = (1 << m);
 826         else
 827             two_m = 1;
 828
 829         two_m_plus_one = (1 << (m+1));
 830
 831         for(i = 0; i < 128; i += two_m_plus_one) {
 832             for(k = 0; k < two_m; k++) {
 833                 p = k + i;
 834                 q = p + two_m;
 835                 tmp_a_r = buf[p].real;
 836                 tmp_a_i = buf[p].imag;
 837                 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
 838                 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
 839                 buf[p].real = tmp_a_r + tmp_b_r;
 840                 buf[p].imag =  tmp_a_i + tmp_b_i;
 841                 buf[q].real = tmp_a_r - tmp_b_r;
 842                 buf[q].imag =  tmp_a_i - tmp_b_i;
 843             }
 844         }
 845     }
 846 */
 847
 848     /* 1. iteration */
 849         // Note w[0][0]={1,0}
 850         asm volatile(
 851                 "xorps %%xmm1, %%xmm1   \n\t"
 852                 "xorps %%xmm2, %%xmm2   \n\t"
 853                 "mov %0, %%"REG_S"      \n\t"
 854                 ".balign 16                             \n\t"
 855                 "1:                     \n\t"
 856                 "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
 857                 "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
 858                 "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
 859                 "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
 860                 "addps %%xmm1, %%xmm0   \n\t"
 861                 "subps %%xmm2, %%xmm0   \n\t"
 862                 "movaps %%xmm0, (%%"REG_S")\n\t"
 863                 "add $16, %%"REG_S"     \n\t"
 864                 "cmp %1, %%"REG_S"      \n\t"
 865                 " jb 1b                 \n\t"
 866                 :: "g" (buf), "r" (buf + 128)
 867                 : "%"REG_S
 868         );
 869
 870     /* 2. iteration */
 871         // Note w[1]={{1,0}, {0,-1}}
 872         asm volatile(
 873                 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
 874                 "mov %0, %%"REG_S"              \n\t"
 875                 ".balign 16                             \n\t"
 876                 "1:                             \n\t"
 877                 "movaps 16(%%"REG_S"), %%xmm2   \n\t" //r2,i2,r3,i3
 878                 "shufps $0xB4, %%xmm2, %%xmm2   \n\t" //r2,i2,i3,r3
 879                 "mulps %%xmm7, %%xmm2           \n\t" //r2,i2,i3,-r3
 880                 "movaps (%%"REG_S"), %%xmm0     \n\t" //r0,i0,r1,i1
 881                 "movaps (%%"REG_S"), %%xmm1     \n\t" //r0,i0,r1,i1
 882                 "addps %%xmm2, %%xmm0           \n\t"
 883                 "subps %%xmm2, %%xmm1           \n\t"
 884                 "movaps %%xmm0, (%%"REG_S")     \n\t"
 885                 "movaps %%xmm1, 16(%%"REG_S")   \n\t"
 886                 "add $32, %%"REG_S"     \n\t"
 887                 "cmp %1, %%"REG_S"      \n\t"
 888                 " jb 1b                 \n\t"
 889                 :: "g" (buf), "r" (buf + 128)
 890                 : "%"REG_S
 891         );
 892
 893     /* 3. iteration */
 894 /*
 895  Note sseW2+0={1,1,sqrt(2),sqrt(2))
 896  Note sseW2+16={0,0,sqrt(2),-sqrt(2))
 897  Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
 898  Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
 899 */
 900         asm volatile(
 901                 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
 902                 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
 903                 "xorps %%xmm5, %%xmm5           \n\t"
 904                 "xorps %%xmm2, %%xmm2           \n\t"
 905                 "mov %0, %%"REG_S"              \n\t"
 906                 ".balign 16                     \n\t"
 907                 "1:                             \n\t"
 908                 "movaps 32(%%"REG_S"), %%xmm2   \n\t" //r4,i4,r5,i5
 909                 "movaps 48(%%"REG_S"), %%xmm3   \n\t" //r6,i6,r7,i7
 910                 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
 911                 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
 912                 "mulps %%xmm2, %%xmm4           \n\t"
 913                 "mulps %%xmm3, %%xmm5           \n\t"
 914                 "shufps $0xB1, %%xmm2, %%xmm2   \n\t" //i4,r4,i5,r5
 915                 "shufps $0xB1, %%xmm3, %%xmm3   \n\t" //i6,r6,i7,r7
 916                 "mulps %%xmm6, %%xmm3           \n\t"
 917                 "mulps %%xmm7, %%xmm2           \n\t"
 918                 "movaps (%%"REG_S"), %%xmm0     \n\t" //r0,i0,r1,i1
 919                 "movaps 16(%%"REG_S"), %%xmm1   \n\t" //r2,i2,r3,i3
 920                 "addps %%xmm4, %%xmm2           \n\t"
 921                 "addps %%xmm5, %%xmm3           \n\t"
 922                 "movaps %%xmm2, %%xmm4          \n\t"
 923                 "movaps %%xmm3, %%xmm5          \n\t"
 924                 "addps %%xmm0, %%xmm2           \n\t"
 925                 "addps %%xmm1, %%xmm3           \n\t"
 926                 "subps %%xmm4, %%xmm0           \n\t"
 927                 "subps %%xmm5, %%xmm1           \n\t"
 928                 "movaps %%xmm2, (%%"REG_S")     \n\t"
 929                 "movaps %%xmm3, 16(%%"REG_S")   \n\t"
 930                 "movaps %%xmm0, 32(%%"REG_S")   \n\t"
 931                 "movaps %%xmm1, 48(%%"REG_S")   \n\t"
 932                 "add $64, %%"REG_S"     \n\t"
 933                 "cmp %1, %%"REG_S"      \n\t"
 934                 " jb 1b                 \n\t"
 935                 :: "g" (buf), "r" (buf + 128)
 936                 : "%"REG_S
 937         );
 938
 939     /* 4-7. iterations */
 940     for (m=3; m < 7; m++) {
 941         two_m = (1 << m);
 942         two_m_plus_one = two_m<<1;
 943         two_m_plus_one_shl3 = (two_m_plus_one<<3);
 944         buf_offset = buf+128;
 945         asm volatile(
 946                 "mov %0, %%"REG_S"                      \n\t"
 947                 ".balign 16                             \n\t"
 948                 "1:                                     \n\t"
 949                 "xor %%"REG_D", %%"REG_D"               \n\t" // k
 950                 "lea (%%"REG_S", %3), %%"REG_d"         \n\t"
 951                 "2:                                     \n\t"
 952                 "movaps (%%"REG_d", %%"REG_D"), %%xmm1  \n\t"
 953                 "movaps (%4, %%"REG_D", 2), %%xmm2      \n\t"
 954                 "mulps %%xmm1, %%xmm2                   \n\t"
 955                 "shufps $0xB1, %%xmm1, %%xmm1           \n\t"
 956                 "mulps 16(%4, %%"REG_D", 2), %%xmm1     \n\t"
 957                 "movaps (%%"REG_S", %%"REG_D"), %%xmm0  \n\t"
 958                 "addps %%xmm2, %%xmm1                   \n\t"
 959                 "movaps %%xmm1, %%xmm2                  \n\t"
 960                 "addps %%xmm0, %%xmm1                   \n\t"
 961                 "subps %%xmm2, %%xmm0                   \n\t"
 962                 "movaps %%xmm1, (%%"REG_S", %%"REG_D")  \n\t"
 963                 "movaps %%xmm0, (%%"REG_d", %%"REG_D")  \n\t"
 964                 "add $16, %%"REG_D"                     \n\t"
 965                 "cmp %3, %%"REG_D"                      \n\t" //FIXME (opt) count against 0
 966                 "jb 2b                                  \n\t"
 967                 "add %2, %%"REG_S"                      \n\t"
 968                 "cmp %1, %%"REG_S"                      \n\t"
 969                 " jb 1b                                 \n\t"
 970                 :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
 971                    "r" (sseW[m])
 972                 : "%"REG_S, "%"REG_D, "%"REG_d
 973         );
 974     }
 975
 976     /* Post IFFT complex multiply  plus IFFT complex conjugate*/
 977         asm volatile(
 978                 "mov $-1024, %%"REG_S"                  \n\t"
 979                 ".balign 16                             \n\t"
 980                 "1:                                     \n\t"
 981                 "movaps (%0, %%"REG_S"), %%xmm0         \n\t"
 982                 "movaps (%0, %%"REG_S"), %%xmm1         \n\t"
 983                 "shufps $0xB1, %%xmm0, %%xmm0           \n\t"
 984                 "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
 985                 "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
 986                 "addps %%xmm1, %%xmm0                   \n\t"
 987                 "movaps %%xmm0, (%0, %%"REG_S")         \n\t"
 988                 "add $16, %%"REG_S"                     \n\t"
 989                 " jnz 1b                                \n\t"
 990                 :: "r" (buf+128)
 991                 : "%"REG_S
 992         );
 993
 994
 995     data_ptr = data;
 996     delay_ptr = delay;
 997     window_ptr = imdct_window;
 998
 999     /* Window and convert to real valued signal */
1000         asm volatile(
1001                 "xor %%"REG_D", %%"REG_D"               \n\t"  // 0
1002                 "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
1003                 "movss %3, %%xmm2                       \n\t"  // bias
1004                 "shufps $0x00, %%xmm2, %%xmm2           \n\t"  // bias, bias, ...
1005                 ".balign 16                             \n\t"
1006                 "1:                                     \n\t"
1007                 "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? A ?
1008                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? C ?
1009                 "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // ? D C ?
1010                 "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // ? B A ?
1011                 "shufps $0x99, %%xmm1, %%xmm0           \n\t" // D C B A
1012                 "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1013                 "addps (%2, %%"REG_S"), %%xmm0          \n\t"
1014                 "addps %%xmm2, %%xmm0                   \n\t"
1015                 "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
1016                 "add  $16, %%"REG_S"                    \n\t"
1017                 "sub  $16, %%"REG_D"                    \n\t"
1018                 "cmp  $512, %%"REG_S"                   \n\t"
1019                 " jb 1b                                 \n\t"
1020                 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1021                 : "%"REG_S, "%"REG_D
1022         );
1023         data_ptr+=128;
1024         delay_ptr+=128;
1025 //      window_ptr+=128;
1026
1027         asm volatile(
1028                 "mov $1024, %%"REG_D"                   \n\t"  // 512
1029                 "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
1030                 "movss %3, %%xmm2                       \n\t"  // bias
1031                 "shufps $0x00, %%xmm2, %%xmm2           \n\t"  // bias, bias, ...
1032                 ".balign 16                             \n\t"
1033                 "1:                                     \n\t"
1034                 "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? ? A
1035                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? ? C
1036                 "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // D ? ? C
1037                 "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // B ? ? A
1038                 "shufps $0xCC, %%xmm1, %%xmm0           \n\t" // D C B A
1039                 "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1040                 "addps (%2, %%"REG_S"), %%xmm0          \n\t"
1041                 "addps %%xmm2, %%xmm0                   \n\t"
1042                 "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
1043                 "add $16, %%"REG_S"                     \n\t"
1044                 "sub $16, %%"REG_D"                     \n\t"
1045                 "cmp $512, %%"REG_S"                    \n\t"
1046                 " jb 1b                                 \n\t"
1047                 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1048                 : "%"REG_S, "%"REG_D
1049         );
1050         data_ptr+=128;
1051 //      window_ptr+=128;
1052
1053     /* The trailing edge of the window goes into the delay line */
1054     delay_ptr = delay;
1055
1056         asm volatile(
1057                 "xor %%"REG_D", %%"REG_D"               \n\t"  // 0
1058                 "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
1059                 ".balign 16                             \n\t"
1060                 "1:                                     \n\t"
1061                 "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? ? A
1062                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? ? C
1063                 "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // D ? ? C
1064                 "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // B ? ? A
1065                 "shufps $0xCC, %%xmm1, %%xmm0           \n\t" // D C B A
1066                 "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1067                 "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
1068                 "add $16, %%"REG_S"                     \n\t"
1069                 "sub $16, %%"REG_D"                     \n\t"
1070                 "cmp $512, %%"REG_S"                    \n\t"
1071                 " jb 1b                                 \n\t"
1072                 :: "r" (buf+64), "r" (delay_ptr)
1073                 : "%"REG_S, "%"REG_D
1074         );
1075         delay_ptr+=128;
1076 //      window_ptr-=128;
1077
1078         asm volatile(
1079                 "mov $1024, %%"REG_D"                   \n\t"  // 1024
1080                 "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
1081                 ".balign 16                             \n\t"
1082                 "1:                                     \n\t"
1083                 "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
1084                 "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? C ?
1085                 "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // ? D C ?
1086                 "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // ? B A ?
1087                 "shufps $0x99, %%xmm1, %%xmm0           \n\t" // D C B A
1088                 "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
1089                 "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
1090                 "add $16, %%"REG_S"                     \n\t"
1091                 "sub $16, %%"REG_D"                     \n\t"
1092                 "cmp $512, %%"REG_S"                    \n\t"
1093                 " jb 1b                                 \n\t"
1094                 :: "r" (buf), "r" (delay_ptr)
1095                 : "%"REG_S, "%"REG_D
1096         );
1097 }
1098 #endif // ARCH_X86 || ARCH_X86_64
1099
1100 void
1101 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
1102 {
1103     int i,k;
1104     int p,q;
1105     int m;
1106     int two_m;
1107     int two_m_plus_one;
1108
1109     sample_t tmp_a_i;
1110     sample_t tmp_a_r;
1111     sample_t tmp_b_i;
1112     sample_t tmp_b_r;
1113
1114     sample_t *data_ptr;
1115     sample_t *delay_ptr;
1116     sample_t *window_ptr;
1117
1118     complex_t *buf_1, *buf_2;
1119
1120     buf_1 = &buf[0];
1121     buf_2 = &buf[64];
1122
1123     /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1124     for(k=0; k<64; k++) {
1125         /* X1[k] = X[2*k]  */
1126         /* X2[k] = X[2*k+1]     */
1127
1128         p = 2 * (128-2*k-1);
1129         q = 2 * (2 * k);
1130
1131         /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
1132         buf_1[k].real =         data[p] * xcos2[k] - data[q] * xsin2[k];
1133         buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
1134         /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
1135         buf_2[k].real =          data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
1136         buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
1137     }
1138
1139     /* IFFT Bit reversed shuffling */
1140     for(i=0; i<64; i++) {
1141         k = bit_reverse_256[i];
1142         if (k < i) {
1143             swap_cmplx(&buf_1[i],&buf_1[k]);
1144             swap_cmplx(&buf_2[i],&buf_2[k]);
1145         }
1146     }
1147
1148     /* FFT Merge */
1149     for (m=0; m < 6; m++) {
1150         two_m = (1 << m);
1151         two_m_plus_one = (1 << (m+1));
1152
1153         /* FIXME */
1154         if(m)
1155             two_m = (1 << m);
1156         else
1157             two_m = 1;
1158
1159         for(k = 0; k < two_m; k++) {
1160             for(i = 0; i < 64; i += two_m_plus_one) {
1161                 p = k + i;
1162                 q = p + two_m;
1163                 /* Do block 1 */
1164                 tmp_a_r = buf_1[p].real;
1165                 tmp_a_i = buf_1[p].imag;
1166                 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
1167                 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
1168                 buf_1[p].real = tmp_a_r + tmp_b_r;
1169                 buf_1[p].imag =  tmp_a_i + tmp_b_i;
1170                 buf_1[q].real = tmp_a_r - tmp_b_r;
1171                 buf_1[q].imag =  tmp_a_i - tmp_b_i;
1172
1173                 /* Do block 2 */
1174                 tmp_a_r = buf_2[p].real;
1175                 tmp_a_i = buf_2[p].imag;
1176                 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
1177                 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
1178                 buf_2[p].real = tmp_a_r + tmp_b_r;
1179                 buf_2[p].imag =  tmp_a_i + tmp_b_i;
1180                 buf_2[q].real = tmp_a_r - tmp_b_r;
1181                 buf_2[q].imag =  tmp_a_i - tmp_b_i;
1182             }
1183         }
1184     }
1185
1186     /* Post IFFT complex multiply */
1187     for( i=0; i < 64; i++) {
1188         /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1189         tmp_a_r =  buf_1[i].real;
1190         tmp_a_i = -buf_1[i].imag;
1191         buf_1[i].real =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
1192         buf_1[i].imag =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
1193         /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
1194         tmp_a_r =  buf_2[i].real;
1195         tmp_a_i = -buf_2[i].imag;
1196         buf_2[i].real =(tmp_a_r * xcos2[i])  -  (tmp_a_i  * xsin2[i]);
1197         buf_2[i].imag =(tmp_a_r * xsin2[i])  +  (tmp_a_i  * xcos2[i]);
1198     }
1199
1200     data_ptr = data;
1201     delay_ptr = delay;
1202     window_ptr = imdct_window;
1203
1204     /* Window and convert to real valued signal */
1205     for(i=0; i< 64; i++) {
1206         *data_ptr++  = -buf_1[i].imag      * *window_ptr++ + *delay_ptr++ + bias;
1207         *data_ptr++  =  buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
1208     }
1209
1210     for(i=0; i< 64; i++) {
1211         *data_ptr++  = -buf_1[i].real      * *window_ptr++ + *delay_ptr++ + bias;
1212         *data_ptr++  =  buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
1213     }
1214
1215     delay_ptr = delay;
1216
1217     for(i=0; i< 64; i++) {
1218         *delay_ptr++ = -buf_2[i].real      * *--window_ptr;
1219         *delay_ptr++ =  buf_2[64-i-1].imag * *--window_ptr;
1220     }
1221
1222     for(i=0; i< 64; i++) {
1223         *delay_ptr++ =  buf_2[i].imag      * *--window_ptr;
1224         *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
1225     }
1226 }
1227
1228 void imdct_init (uint32_t mm_accel)
1229 {
1230 #ifdef LIBA52_MLIB
1231     if (mm_accel & MM_ACCEL_MLIB) {
1232         fprintf (stderr, "Using mlib for IMDCT transform\n");
1233         imdct_512 = imdct_do_512_mlib;
1234         imdct_256 = imdct_do_256_mlib;
1235     } else
1236 #endif
1237     {
1238         int i, j, k;
1239
1240         /* Twiddle factors to turn IFFT into IMDCT */
1241         for (i = 0; i < 128; i++) {
1242             xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
1243             xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
1244         }
1245 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1246         for (i = 0; i < 128; i++) {
1247             sseSinCos1c[2*i+0]= xcos1[i];
1248             sseSinCos1c[2*i+1]= -xcos1[i];
1249             sseSinCos1d[2*i+0]= xsin1[i];
1250             sseSinCos1d[2*i+1]= xsin1[i];
1251         }
1252 #endif
1253
1254         /* More twiddle factors to turn IFFT into IMDCT */
1255         for (i = 0; i < 64; i++) {
1256             xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
1257             xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
1258         }
1259
1260         for (i = 0; i < 7; i++) {
1261             j = 1 << i;
1262             for (k = 0; k < j; k++) {
1263                 w[i][k].real = cos (-M_PI * k / j);
1264                 w[i][k].imag = sin (-M_PI * k / j);
1265             }
1266         }
1267 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1268         for (i = 1; i < 7; i++) {
1269             j = 1 << i;
1270             for (k = 0; k < j; k+=2) {
1271
1272                 sseW[i][4*k + 0] = w[i][k+0].real;
1273                 sseW[i][4*k + 1] = w[i][k+0].real;
1274                 sseW[i][4*k + 2] = w[i][k+1].real;
1275                 sseW[i][4*k + 3] = w[i][k+1].real;
1276
1277                 sseW[i][4*k + 4] = -w[i][k+0].imag;
1278                 sseW[i][4*k + 5] = w[i][k+0].imag;
1279                 sseW[i][4*k + 6] = -w[i][k+1].imag;
1280                 sseW[i][4*k + 7] = w[i][k+1].imag;
1281
1282         //we multiply more or less uninitalized numbers so we need to use exactly 0.0
1283                 if(k==0)
1284                 {
1285 //                      sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
1286                         sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
1287                 }
1288
1289                 if(2*k == j)
1290                 {
1291                         sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
1292 //                      sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
1293                 }
1294             }
1295         }
1296
1297         for(i=0; i<128; i++)
1298         {
1299                 sseWindow[2*i+0]= -imdct_window[2*i+0];
1300                 sseWindow[2*i+1]=  imdct_window[2*i+1];
1301         }
1302
1303         for(i=0; i<64; i++)
1304         {
1305                 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
1306                 sseWindow[256 + 2*i+1]=  imdct_window[254 - 2*i+0];
1307                 sseWindow[384 + 2*i+0]=  imdct_window[126 - 2*i+1];
1308                 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
1309         }
1310 #endif // ARCH_X86 || ARCH_X86_64
1311
1312         imdct_512 = imdct_do_512;
1313 #if defined(ARCH_X86) || defined(ARCH_X86_64)
1314         if(mm_accel & MM_ACCEL_X86_SSE)
1315         {
1316           fprintf (stderr, "Using SSE optimized IMDCT transform\n");
1317           imdct_512 = imdct_do_512_sse;
1318         }
1319         else
1320         if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
1321         {
1322           fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
1323           imdct_512 = imdct_do_512_3dnowex;
1324         }
1325         else
1326         if(mm_accel & MM_ACCEL_X86_3DNOW)
1327         {
1328           fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
1329           imdct_512 = imdct_do_512_3dnow;
1330         }
1331         else
1332 #endif // ARCH_X86 || ARCH_X86_64
1333 #ifdef HAVE_ALTIVEC
1334         if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
1335         {
1336           fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
1337           imdct_512 = imdct_do_512_altivec;
1338         }
1339         else
1340 #endif
1341         fprintf (stderr, "No accelerated IMDCT transform found\n");
1342         imdct_256 = imdct_do_256;
1343     }
1344 }
1345
1346 static void fft_asmb(int k, complex_t *x, complex_t *wTB,
1347              const complex_t *d, const complex_t *d_3)
1348 {
1349   register complex_t  *x2k, *x3k, *x4k, *wB;
1350   register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
1351
1352   x2k = x + 2 * k;
1353   x3k = x2k + 2 * k;
1354   x4k = x3k + 2 * k;
1355   wB = wTB + 2 * k;
1356
1357   TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
1358   TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
1359
1360   --k;
1361   for(;;) {
1362      TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
1363      TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
1364      if (!--k) break;
1365      x += 2;
1366      x2k += 2;
1367      x3k += 2;
1368      x4k += 2;
1369      d += 2;
1370      d_3 += 2;
1371      wTB += 2;
1372      wB += 2;
1373   }
1374
1375 }
1376
1377 static void fft_asmb16(complex_t *x, complex_t *wTB)
1378 {
1379   register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
1380   int k = 2;
1381
1382   /* transform x[0], x[8], x[4], x[12] */
1383   TRANSZERO(x[0],x[4],x[8],x[12]);
1384
1385   /* transform x[1], x[9], x[5], x[13] */
1386   TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
1387
1388   /* transform x[2], x[10], x[6], x[14] */
1389   TRANSHALF_16(x[2],x[6],x[10],x[14]);
1390
1391   /* transform x[3], x[11], x[7], x[15] */
1392   TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
1393
1394 }
1395
1396 static void fft_4(complex_t *x)
1397 {
1398   /* delta_p = 1 here */
1399   /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
1400    */
1401
1402   register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
1403
1404   yt_r = x[0].real;
1405   yb_r = yt_r - x[2].real;
1406   yt_r += x[2].real;
1407
1408   u_r = x[1].real;
1409   vi_i = x[3].real - u_r;
1410   u_r += x[3].real;
1411
1412   u_i = x[1].imag;
1413   vi_r = u_i - x[3].imag;
1414   u_i += x[3].imag;
1415
1416   yt_i = yt_r;
1417   yt_i += u_r;
1418   x[0].real = yt_i;
1419   yt_r -= u_r;
1420   x[2].real = yt_r;
1421   yt_i = yb_r;
1422   yt_i += vi_r;
1423   x[1].real = yt_i;
1424   yb_r -= vi_r;
1425   x[3].real = yb_r;
1426
1427   yt_i = x[0].imag;
1428   yb_i = yt_i - x[2].imag;
1429   yt_i += x[2].imag;
1430
1431   yt_r = yt_i;
1432   yt_r += u_i;
1433   x[0].imag = yt_r;
1434   yt_i -= u_i;
1435   x[2].imag = yt_i;
1436   yt_r = yb_i;
1437   yt_r += vi_i;
1438   x[1].imag = yt_r;
1439   yb_i -= vi_i;
1440   x[3].imag = yb_i;
1441 }
1442
1443
1444 static void fft_8(complex_t *x)
1445 {
1446   /* delta_p = diag{1, sqrt(i)} here */
1447   /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
1448    */
1449   register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
1450
1451   wT1_r = x[1].real;
1452   wT1_i = x[1].imag;
1453   wB1_r = x[3].real;
1454   wB1_i = x[3].imag;
1455
1456   x[1] = x[2];
1457   x[2] = x[4];
1458   x[3] = x[6];
1459   fft_4(&x[0]);
1460
1461
1462   /* x[0] x[4] */
1463   wT2_r = x[5].real;
1464   wT2_r += x[7].real;
1465   wT2_r += wT1_r;
1466   wT2_r += wB1_r;
1467   wT2_i = wT2_r;
1468   wT2_r += x[0].real;
1469   wT2_i = x[0].real - wT2_i;
1470   x[0].real = wT2_r;
1471   x[4].real = wT2_i;
1472
1473   wT2_i = x[5].imag;
1474   wT2_i += x[7].imag;
1475   wT2_i += wT1_i;
1476   wT2_i += wB1_i;
1477   wT2_r = wT2_i;
1478   wT2_r += x[0].imag;
1479   wT2_i = x[0].imag - wT2_i;
1480   x[0].imag = wT2_r;
1481   x[4].imag = wT2_i;
1482
1483   /* x[2] x[6] */
1484   wT2_r = x[5].imag;
1485   wT2_r -= x[7].imag;
1486   wT2_r += wT1_i;
1487   wT2_r -= wB1_i;
1488   wT2_i = wT2_r;
1489   wT2_r += x[2].real;
1490   wT2_i = x[2].real - wT2_i;
1491   x[2].real = wT2_r;
1492   x[6].real = wT2_i;
1493
1494   wT2_i = x[5].real;
1495   wT2_i -= x[7].real;
1496   wT2_i += wT1_r;
1497   wT2_i -= wB1_r;
1498   wT2_r = wT2_i;
1499   wT2_r += x[2].imag;
1500   wT2_i = x[2].imag - wT2_i;
1501   x[2].imag = wT2_i;
1502   x[6].imag = wT2_r;
1503
1504
1505   /* x[1] x[5] */
1506   wT2_r = wT1_r;
1507   wT2_r += wB1_i;
1508   wT2_r -= x[5].real;
1509   wT2_r -= x[7].imag;
1510   wT2_i = wT1_i;
1511   wT2_i -= wB1_r;
1512   wT2_i -= x[5].imag;
1513   wT2_i += x[7].real;
1514
1515   wB2_r = wT2_r;
1516   wB2_r += wT2_i;
1517   wT2_i -= wT2_r;
1518   wB2_r *= HSQRT2;
1519   wT2_i *= HSQRT2;
1520   wT2_r = wB2_r;
1521   wB2_r += x[1].real;
1522   wT2_r =  x[1].real - wT2_r;
1523
1524   wB2_i = x[5].real;
1525   x[1].real = wB2_r;
1526   x[5].real = wT2_r;
1527
1528   wT2_r = wT2_i;
1529   wT2_r += x[1].imag;
1530   wT2_i = x[1].imag - wT2_i;
1531   wB2_r = x[5].imag;
1532   x[1].imag = wT2_r;
1533   x[5].imag = wT2_i;
1534
1535   /* x[3] x[7] */
1536   wT1_r -= wB1_i;
1537   wT1_i += wB1_r;
1538   wB1_r = wB2_i - x[7].imag;
1539   wB1_i = wB2_r + x[7].real;
1540   wT1_r -= wB1_r;
1541   wT1_i -= wB1_i;
1542   wB1_r = wT1_r + wT1_i;
1543   wB1_r *= HSQRT2;
1544   wT1_i -= wT1_r;
1545   wT1_i *= HSQRT2;
1546   wB2_r = x[3].real;
1547   wB2_i = wB2_r + wT1_i;
1548   wB2_r -= wT1_i;
1549   x[3].real = wB2_i;
1550   x[7].real = wB2_r;
1551   wB2_i = x[3].imag;
1552   wB2_r = wB2_i + wB1_r;
1553   wB2_i -= wB1_r;
1554   x[3].imag = wB2_i;
1555   x[7].imag = wB2_r;
1556 }
1557
1558
1559 static void fft_128p(complex_t *a)
1560 {
1561   fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
1562   fft_asmb16(&a[0], &a[8]);
1563
1564   fft_8(&a[16]), fft_8(&a[24]);
1565   fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
1566
1567   fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
1568   fft_asmb16(&a[32], &a[40]);
1569
1570   fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
1571   fft_asmb16(&a[48], &a[56]);
1572
1573   fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
1574
1575   fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
1576   /* fft_16(&a[64]); */
1577   fft_asmb16(&a[64], &a[72]);
1578
1579   fft_8(&a[80]); fft_8(&a[88]);
1580
1581   /* fft_32(&a[64]); */
1582   fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
1583
1584   fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
1585   /* fft_16(&a[96]); */
1586   fft_asmb16(&a[96], &a[104]);
1587
1588   fft_8(&a[112]), fft_8(&a[120]);
1589   /* fft_32(&a[96]); */
1590   fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
1591
1592   /* fft_128(&a[0]); */
1593   fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);
1594 }
1595
1596
1597