adding the code documentation guide lines
[mplayer/glamo.git] / liba52 / imdct.c
blobd5a3eb6b6a44def0a20498f2ea344b26dcdb9595
1 /*
2 * imdct.c
3 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
4 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
6 * This file is part of a52dec, a free ATSC A-52 stream decoder.
7 * See http://liba52.sourceforge.net/ for updates.
9 * a52dec is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * a52dec is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
24 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
25 * michael did port them from libac3 (untested, perhaps totally broken)
26 * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
29 #include "config.h"
31 #include <math.h>
32 #include <stdio.h>
33 #ifndef M_PI
34 #define M_PI 3.1415926535897932384626433832795029
35 #endif
36 #include <inttypes.h>
38 #include "a52.h"
39 #include "a52_internal.h"
40 #include "mm_accel.h"
41 #include "mangle.h"
43 #ifdef RUNTIME_CPUDETECT
44 #undef HAVE_3DNOWEX
45 #endif
47 #define USE_AC3_C
/* Dispatch pointers to the active 256- and 512-sample IMDCT implementations
 * (C / SSE / 3DNow! / AltiVec variants exist in this file).
 * NOTE(review): the assignment happens in an init routine not visible in this
 * chunk — confirm against imdct_init(). */
49 void (* imdct_256) (sample_t data[], sample_t delay[], sample_t bias);
50 void (* imdct_512) (sample_t data[], sample_t delay[], sample_t bias);
/* Complex value in rectangular form; sample_t comes from a52.h. */
52 typedef struct complex_s {
53     sample_t real;
54     sample_t imag;
55 } complex_t;
/* Forward declaration: 128-point FFT used by the USE_AC3_C path of
 * imdct_do_512() below. */
57 static void fft_128p(complex_t *a);
/* Input permutation for fft_128p(): reordering applied during the pre-IFFT
 * complex multiply when USE_AC3_C is defined (see imdct_do_512). */
59 static const int pm128[128] attribute_used __attribute__((aligned(16))) =
61 0, 16, 32, 48, 64, 80, 96, 112, 8, 40, 72, 104, 24, 56, 88, 120,
62 4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44, 60, 76, 92, 108, 124,
63 2, 18, 34, 50, 66, 82, 98, 114, 10, 42, 74, 106, 26, 58, 90, 122,
64 6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62, 94, 126,
65 1, 17, 33, 49, 65, 81, 97, 113, 9, 41, 73, 105, 25, 57, 89, 121,
66 5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45, 61, 77, 93, 109, 125,
67 3, 19, 35, 51, 67, 83, 99, 115, 11, 43, 75, 107, 27, 59, 91, 123,
68 7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47, 63, 79, 95, 111, 127
69 };
71 /* 128 point bit-reverse LUT */
/* Used by the generic (non-USE_AC3_C) reorder loop and referenced by name
 * from the SSE shuffle asm in imdct_do_512_sse(). */
72 static uint8_t attribute_used bit_reverse_512[] = {
73 0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
74 0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
75 0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
76 0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
77 0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
78 0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
79 0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
80 0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
81 0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
82 0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
83 0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
84 0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
85 0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
86 0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
87 0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
88 0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
/* 64-entry bit-reverse LUT; presumably used by the 256-sample transform
 * (imdct_do_256 is not visible in this chunk) — TODO confirm. */
90 static uint8_t bit_reverse_256[] = {
91 0x00, 0x20, 0x10, 0x30, 0x08, 0x28, 0x18, 0x38,
92 0x04, 0x24, 0x14, 0x34, 0x0c, 0x2c, 0x1c, 0x3c,
93 0x02, 0x22, 0x12, 0x32, 0x0a, 0x2a, 0x1a, 0x3a,
94 0x06, 0x26, 0x16, 0x36, 0x0e, 0x2e, 0x1e, 0x3e,
95 0x01, 0x21, 0x11, 0x31, 0x09, 0x29, 0x19, 0x39,
96 0x05, 0x25, 0x15, 0x35, 0x0d, 0x2d, 0x1d, 0x3d,
97 0x03, 0x23, 0x13, 0x33, 0x0b, 0x2b, 0x1b, 0x3b,
98 0x07, 0x27, 0x17, 0x37, 0x0f, 0x2f, 0x1f, 0x3f};
100 #ifdef ARCH_X86
101 // NOTE: SSE needs 16byte alignment or it will segfault
/* buf: shared 128-point complex scratch for all IMDCT variants.
 * The sse* tables are consumed by the inline asm in imdct_do_512_sse();
 * NOTE(review): their contents appear to be filled by an init routine not
 * visible in this chunk — confirm before relying on the values. */
103 static complex_t __attribute__((aligned(16))) buf[128];
104 static float __attribute__((aligned(16))) sseSinCos1c[256];
105 static float __attribute__((aligned(16))) sseSinCos1d[256];
106 static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
107 //static float __attribute__((aligned(16))) sseW0[4];
/* Per-stage SSE twiddle tables; sseW[m] is indexed by FFT stage m in the
 * "4-7. iterations" asm loop of imdct_do_512_sse(). */
108 static float __attribute__((aligned(16))) sseW1[8];
109 static float __attribute__((aligned(16))) sseW2[16];
110 static float __attribute__((aligned(16))) sseW3[32];
111 static float __attribute__((aligned(16))) sseW4[64];
112 static float __attribute__((aligned(16))) sseW5[128];
113 static float __attribute__((aligned(16))) sseW6[256];
114 static float __attribute__((aligned(16))) *sseW[7]=
115 {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
116 static float __attribute__((aligned(16))) sseWindow[512];
117 #else
118 static complex_t __attribute__((aligned(16))) buf[128];
119 #endif
121 /* Twiddle factor LUT */
/* w[m] holds the 2^m twiddles for FFT stage m; values presumably computed at
 * init (not visible here) — TODO confirm. */
122 static complex_t __attribute__((aligned(16))) w_1[1];
123 static complex_t __attribute__((aligned(16))) w_2[2];
124 static complex_t __attribute__((aligned(16))) w_4[4];
125 static complex_t __attribute__((aligned(16))) w_8[8];
126 static complex_t __attribute__((aligned(16))) w_16[16];
127 static complex_t __attribute__((aligned(16))) w_32[32];
128 static complex_t __attribute__((aligned(16))) w_64[64];
129 static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
131 /* Twiddle factors for IMDCT */
/* xcos1/xsin1 (128 pts) drive the 512-sample transform's pre/post multiplies;
 * xcos2/xsin2 (64 pts) presumably serve the 256-sample transform — confirm. */
132 static sample_t __attribute__((aligned(16))) xcos1[128];
133 static sample_t __attribute__((aligned(16))) xsin1[128];
134 static sample_t __attribute__((aligned(16))) xcos2[64];
135 static sample_t __attribute__((aligned(16))) xsin2[64];
137 /* Windowing function for Modified DCT - Thank you acroread */
/* 256-entry half window: read forwards over the first 256 output samples in
 * imdct_do_512(), then walked backwards (via *--window_ptr) to window the
 * trailing edge into the delay line. Values rise monotonically 0..1. */
138 sample_t imdct_window[] = {
139 0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
140 0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
141 0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
142 0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
143 0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
144 0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
145 0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
146 0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
147 0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
148 0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
149 0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
150 0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
151 0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
152 0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
153 0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
154 0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
155 0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
156 0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
157 0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
158 0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
159 0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
160 0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
161 0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
162 0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
163 0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
164 0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
165 0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
166 0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
167 0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
168 0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
169 0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
170 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000 };
173 static inline void swap_cmplx(complex_t *a, complex_t *b)
175 complex_t tmp;
177 tmp = *a;
178 *a = *b;
179 *b = tmp;
184 static inline complex_t cmplx_mult(complex_t a, complex_t b)
186 complex_t ret;
188 ret.real = a.real * b.real - a.imag * b.imag;
189 ret.imag = a.real * b.imag + a.imag * b.real;
191 return ret;
/* Plain-C 512-sample IMDCT.
 * data[]  : in/out — 256 frequency coefficients on entry, 256 windowed time
 *           samples on exit.
 * delay[] : 256-sample overlap carried from the previous block; refilled with
 *           this block's trailing window edge before returning.
 * bias    : constant added to every output sample.
 * Uses the module-level buf[], xcos1/xsin1, w[] and imdct_window tables.
 * NOTE(review): this chunk was scraped from a web viewer and brace-only
 * lines were lost; token content below is otherwise verbatim. */
194 void
195 imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
197 int i;
198 #ifndef USE_AC3_C
199 int k;
200 int p,q;
201 int m;
202 int two_m;
203 int two_m_plus_one;
205 sample_t tmp_b_i;
206 sample_t tmp_b_r;
207 #endif
208 sample_t tmp_a_i;
209 sample_t tmp_a_r;
211 sample_t *data_ptr;
212 sample_t *delay_ptr;
213 sample_t *window_ptr;
215 /* 512 IMDCT with source and dest data in 'data' */
217 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
218 for( i=0; i < 128; i++) {
219 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
220 #ifdef USE_AC3_C
221 int j= pm128[i];
222 #else
223 int j= bit_reverse_512[i];
224 #endif
225 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
226 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
229 /* FFT Merge */
230 /* unoptimized variant
231 for (m=1; m < 7; m++) {
232 if(m)
233 two_m = (1 << m);
234 else
235 two_m = 1;
237 two_m_plus_one = (1 << (m+1));
239 for(i = 0; i < 128; i += two_m_plus_one) {
240 for(k = 0; k < two_m; k++) {
241 p = k + i;
242 q = p + two_m;
243 tmp_a_r = buf[p].real;
244 tmp_a_i = buf[p].imag;
245 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
246 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
247 buf[p].real = tmp_a_r + tmp_b_r;
248 buf[p].imag = tmp_a_i + tmp_b_i;
249 buf[q].real = tmp_a_r - tmp_b_r;
250 buf[q].imag = tmp_a_i - tmp_b_i;
255 #ifdef USE_AC3_C
256 fft_128p (&buf[0]);
257 #else
259 /* 1. iteration */
260 for(i = 0; i < 128; i += 2) {
261 tmp_a_r = buf[i].real;
262 tmp_a_i = buf[i].imag;
263 tmp_b_r = buf[i+1].real;
264 tmp_b_i = buf[i+1].imag;
265 buf[i].real = tmp_a_r + tmp_b_r;
266 buf[i].imag = tmp_a_i + tmp_b_i;
267 buf[i+1].real = tmp_a_r - tmp_b_r;
268 buf[i+1].imag = tmp_a_i - tmp_b_i;
271 /* 2. iteration */
272 // Note w[1]={{1,0}, {0,-1}}
273 for(i = 0; i < 128; i += 4) {
274 tmp_a_r = buf[i].real;
275 tmp_a_i = buf[i].imag;
276 tmp_b_r = buf[i+2].real;
277 tmp_b_i = buf[i+2].imag;
278 buf[i].real = tmp_a_r + tmp_b_r;
279 buf[i].imag = tmp_a_i + tmp_b_i;
280 buf[i+2].real = tmp_a_r - tmp_b_r;
281 buf[i+2].imag = tmp_a_i - tmp_b_i;
282 tmp_a_r = buf[i+1].real;
283 tmp_a_i = buf[i+1].imag;
284 tmp_b_r = buf[i+3].imag;
285 tmp_b_i = buf[i+3].real;
286 buf[i+1].real = tmp_a_r + tmp_b_r;
287 buf[i+1].imag = tmp_a_i - tmp_b_i;
288 buf[i+3].real = tmp_a_r - tmp_b_r;
289 buf[i+3].imag = tmp_a_i + tmp_b_i;
292 /* 3. iteration */
293 for(i = 0; i < 128; i += 8) {
294 tmp_a_r = buf[i].real;
295 tmp_a_i = buf[i].imag;
296 tmp_b_r = buf[i+4].real;
297 tmp_b_i = buf[i+4].imag;
298 buf[i].real = tmp_a_r + tmp_b_r;
299 buf[i].imag = tmp_a_i + tmp_b_i;
300 buf[i+4].real = tmp_a_r - tmp_b_r;
301 buf[i+4].imag = tmp_a_i - tmp_b_i;
302 tmp_a_r = buf[1+i].real;
303 tmp_a_i = buf[1+i].imag;
304 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
305 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
306 buf[1+i].real = tmp_a_r + tmp_b_r;
307 buf[1+i].imag = tmp_a_i + tmp_b_i;
308 buf[i+5].real = tmp_a_r - tmp_b_r;
309 buf[i+5].imag = tmp_a_i - tmp_b_i;
310 tmp_a_r = buf[i+2].real;
311 tmp_a_i = buf[i+2].imag;
312 tmp_b_r = buf[i+6].imag;
313 tmp_b_i = - buf[i+6].real;
314 buf[i+2].real = tmp_a_r + tmp_b_r;
315 buf[i+2].imag = tmp_a_i + tmp_b_i;
316 buf[i+6].real = tmp_a_r - tmp_b_r;
317 buf[i+6].imag = tmp_a_i - tmp_b_i;
318 tmp_a_r = buf[i+3].real;
319 tmp_a_i = buf[i+3].imag;
320 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
321 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
322 buf[i+3].real = tmp_a_r + tmp_b_r;
323 buf[i+3].imag = tmp_a_i + tmp_b_i;
324 buf[i+7].real = tmp_a_r - tmp_b_r;
325 buf[i+7].imag = tmp_a_i - tmp_b_i;
328 /* 4-7. iterations */
329 for (m=3; m < 7; m++) {
330 two_m = (1 << m);
332 two_m_plus_one = two_m<<1;
334 for(i = 0; i < 128; i += two_m_plus_one) {
335 for(k = 0; k < two_m; k++) {
336 int p = k + i;
337 int q = p + two_m;
338 tmp_a_r = buf[p].real;
339 tmp_a_i = buf[p].imag;
340 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
341 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
342 buf[p].real = tmp_a_r + tmp_b_r;
343 buf[p].imag = tmp_a_i + tmp_b_i;
344 buf[q].real = tmp_a_r - tmp_b_r;
345 buf[q].imag = tmp_a_i - tmp_b_i;
349 #endif
350 /* Post IFFT complex multiply plus IFFT complex conjugate*/
/* Conjugation is folded in by negating the imaginary part before the
 * twiddle multiply. */
351 for( i=0; i < 128; i++) {
352 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
353 tmp_a_r = buf[i].real;
354 tmp_a_i = -1.0 * buf[i].imag;
355 buf[i].real =(tmp_a_r * xcos1[i]) - (tmp_a_i * xsin1[i]);
356 buf[i].imag =(tmp_a_r * xsin1[i]) + (tmp_a_i * xcos1[i]);
359 data_ptr = data;
360 delay_ptr = delay;
361 window_ptr = imdct_window;
363 /* Window and convert to real valued signal */
/* First 256 outputs: rising half-window, previous block's delay added in. */
364 for(i=0; i< 64; i++) {
365 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
366 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
369 for(i=0; i< 64; i++) {
370 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
371 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
374 /* The trailing edge of the window goes into the delay line */
/* window_ptr now points one past the table end; it is walked backwards so
 * the delay line gets the mirrored (falling) half-window. */
375 delay_ptr = delay;
377 for(i=0; i< 64; i++) {
378 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
379 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
382 for(i=0; i<64; i++) {
383 *delay_ptr++ = buf[i].imag * *--window_ptr;
384 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
388 #ifdef HAVE_ALTIVEC
390 #ifndef SYS_DARWIN
391 #include <altivec.h>
392 #endif
394 // used to build registers permutation vectors (vcprm)
395 // the 's' are for words in the _s_econd vector
396 #define WORD_0 0x00,0x01,0x02,0x03
397 #define WORD_1 0x04,0x05,0x06,0x07
398 #define WORD_2 0x08,0x09,0x0a,0x0b
399 #define WORD_3 0x0c,0x0d,0x0e,0x0f
400 #define WORD_s0 0x10,0x11,0x12,0x13
401 #define WORD_s1 0x14,0x15,0x16,0x17
402 #define WORD_s2 0x18,0x19,0x1a,0x1b
403 #define WORD_s3 0x1c,0x1d,0x1e,0x1f
/* Darwin's compiler uses parenthesized vector literals; everywhere else the
 * braced GCC syntax is required — hence the two definitions. */
405 #ifdef SYS_DARWIN
406 #define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
407 #else
408 #define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
409 #endif
411 // vcprmle is used to keep the same index as in the SSE version.
412 // it's the same as vcprm, with the index inversed
413 // ('le' is Little Endian)
414 #define vcprmle(a,b,c,d) vcprm(d,c,b,a)
416 // used to build inverse/identity vectors (vcii)
417 // n is _n_egative, p is _p_ositive
418 #define FLOAT_n -1.
419 #define FLOAT_p 1.
421 #ifdef SYS_DARWIN
422 #define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
423 #else
424 #define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
425 #endif
/* FOUROF(a): splat a scalar into a 4-element vector literal, again with the
 * Darwin vs. GCC literal-syntax split. */
427 #ifdef SYS_DARWIN
428 #define FOUROF(a) (a)
429 #else
430 #define FOUROF(a) {a,a,a,a}
431 #endif
/* AltiVec-accelerated 512-sample IMDCT; same contract as imdct_do_512():
 * data[] in/out, delay[] 256-sample overlap, bias added to each output.
 * Each FFT stage keeps the scalar reference code in an "#if 0" block and the
 * vectorized equivalent in the "#else" branch; the scalar tmp_* variables are
 * still live because stage 3 mixes scalar twiddle math with vector loads.
 * NOTE(review): brace-only lines were lost in this scraped copy; token
 * content is otherwise verbatim. */
434 void
435 imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
437 int i;
438 int k;
439 int p,q;
440 int m;
441 int two_m;
442 int two_m_plus_one;
444 sample_t tmp_b_i;
445 sample_t tmp_b_r;
446 sample_t tmp_a_i;
447 sample_t tmp_a_r;
449 sample_t *data_ptr;
450 sample_t *delay_ptr;
451 sample_t *window_ptr;
453 /* 512 IMDCT with source and dest data in 'data' */
455 /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
456 for( i=0; i < 128; i++) {
457 /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
458 int j= bit_reverse_512[i];
459 buf[i].real = (data[256-2*j-1] * xcos1[j]) - (data[2*j] * xsin1[j]);
460 buf[i].imag = -1.0 * ((data[2*j] * xcos1[j]) + (data[256-2*j-1] * xsin1[j]));
463 /* 1. iteration */
464 for(i = 0; i < 128; i += 2) {
465 #if 0
466 tmp_a_r = buf[i].real;
467 tmp_a_i = buf[i].imag;
468 tmp_b_r = buf[i+1].real;
469 tmp_b_i = buf[i+1].imag;
470 buf[i].real = tmp_a_r + tmp_b_r;
471 buf[i].imag = tmp_a_i + tmp_b_i;
472 buf[i+1].real = tmp_a_r - tmp_b_r;
473 buf[i+1].imag = tmp_a_i - tmp_b_i;
474 #else
/* One vector holds both complex points (i<<3 = byte offset of buf[i]);
 * the permute swaps the two points so one madd does sum and difference. */
475 vector float temp, bufv;
477 bufv = vec_ld(i << 3, (float*)buf);
478 temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
479 bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
480 vec_st(bufv, i << 3, (float*)buf);
481 #endif
484 /* 2. iteration */
485 // Note w[1]={{1,0}, {0,-1}}
486 for(i = 0; i < 128; i += 4) {
487 #if 0
488 tmp_a_r = buf[i].real;
489 tmp_a_i = buf[i].imag;
490 tmp_b_r = buf[i+2].real;
491 tmp_b_i = buf[i+2].imag;
492 buf[i].real = tmp_a_r + tmp_b_r;
493 buf[i].imag = tmp_a_i + tmp_b_i;
494 buf[i+2].real = tmp_a_r - tmp_b_r;
495 buf[i+2].imag = tmp_a_i - tmp_b_i;
496 tmp_a_r = buf[i+1].real;
497 tmp_a_i = buf[i+1].imag;
498 /* WARNING: im <-> re here ! */
499 tmp_b_r = buf[i+3].imag;
500 tmp_b_i = buf[i+3].real;
501 buf[i+1].real = tmp_a_r + tmp_b_r;
502 buf[i+1].imag = tmp_a_i - tmp_b_i;
503 buf[i+3].real = tmp_a_r - tmp_b_r;
504 buf[i+3].imag = tmp_a_i + tmp_b_i;
505 #else
506 vector float buf01, buf23, temp1, temp2;
508 buf01 = vec_ld((i + 0) << 3, (float*)buf);
509 buf23 = vec_ld((i + 2) << 3, (float*)buf);
510 buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
512 temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
513 temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
515 vec_st(temp1, (i + 0) << 3, (float*)buf);
516 vec_st(temp2, (i + 2) << 3, (float*)buf);
517 #endif
520 /* 3. iteration */
521 for(i = 0; i < 128; i += 8) {
522 #if 0
523 tmp_a_r = buf[i].real;
524 tmp_a_i = buf[i].imag;
525 tmp_b_r = buf[i+4].real;
526 tmp_b_i = buf[i+4].imag;
527 buf[i].real = tmp_a_r + tmp_b_r;
528 buf[i].imag = tmp_a_i + tmp_b_i;
529 buf[i+4].real = tmp_a_r - tmp_b_r;
530 buf[i+4].imag = tmp_a_i - tmp_b_i;
531 tmp_a_r = buf[1+i].real;
532 tmp_a_i = buf[1+i].imag;
533 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
534 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
535 buf[1+i].real = tmp_a_r + tmp_b_r;
536 buf[1+i].imag = tmp_a_i + tmp_b_i;
537 buf[i+5].real = tmp_a_r - tmp_b_r;
538 buf[i+5].imag = tmp_a_i - tmp_b_i;
539 tmp_a_r = buf[i+2].real;
540 tmp_a_i = buf[i+2].imag;
541 /* WARNING re <-> im & sign */
542 tmp_b_r = buf[i+6].imag;
543 tmp_b_i = - buf[i+6].real;
544 buf[i+2].real = tmp_a_r + tmp_b_r;
545 buf[i+2].imag = tmp_a_i + tmp_b_i;
546 buf[i+6].real = tmp_a_r - tmp_b_r;
547 buf[i+6].imag = tmp_a_i - tmp_b_i;
548 tmp_a_r = buf[i+3].real;
549 tmp_a_i = buf[i+3].imag;
550 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
551 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
552 buf[i+3].real = tmp_a_r + tmp_b_r;
553 buf[i+3].imag = tmp_a_i + tmp_b_i;
554 buf[i+7].real = tmp_a_r - tmp_b_r;
555 buf[i+7].imag = tmp_a_i - tmp_b_i;
556 #else
/* Hybrid stage: buf[i+5]/buf[i+7] get their twiddle rotation in scalar code
 * first (written back to memory), then buf23 is RELOADED so the vector
 * butterflies see the updated values — the second vec_ld of buf23 below is
 * deliberate, not redundant. */
557 vector float buf01, buf23, buf45, buf67;
559 buf01 = vec_ld((i + 0) << 3, (float*)buf);
560 buf23 = vec_ld((i + 2) << 3, (float*)buf);
562 tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
563 tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
564 buf[i+5].real = tmp_b_r;
565 buf[i+5].imag = tmp_b_i;
566 tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
567 tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
568 buf[i+7].real = tmp_b_r;
569 buf[i+7].imag = tmp_b_i;
571 buf23 = vec_ld((i + 2) << 3, (float*)buf);
572 buf45 = vec_ld((i + 4) << 3, (float*)buf);
573 buf67 = vec_ld((i + 6) << 3, (float*)buf);
574 buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
576 vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
577 vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
578 vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
579 vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
580 #endif
583 /* 4-7. iterations */
584 for (m=3; m < 7; m++) {
585 two_m = (1 << m);
587 two_m_plus_one = two_m<<1;
589 for(i = 0; i < 128; i += two_m_plus_one) {
590 for(k = 0; k < two_m; k+=2) {
591 #if 0
592 int p = k + i;
593 int q = p + two_m;
594 tmp_a_r = buf[p].real;
595 tmp_a_i = buf[p].imag;
596 tmp_b_r =
597 buf[q].real * w[m][k].real -
598 buf[q].imag * w[m][k].imag;
599 tmp_b_i =
600 buf[q].imag * w[m][k].real +
601 buf[q].real * w[m][k].imag;
602 buf[p].real = tmp_a_r + tmp_b_r;
603 buf[p].imag = tmp_a_i + tmp_b_i;
604 buf[q].real = tmp_a_r - tmp_b_r;
605 buf[q].imag = tmp_a_i - tmp_b_i;
607 tmp_a_r = buf[(p + 1)].real;
608 tmp_a_i = buf[(p + 1)].imag;
609 tmp_b_r =
610 buf[(q + 1)].real * w[m][(k + 1)].real -
611 buf[(q + 1)].imag * w[m][(k + 1)].imag;
612 tmp_b_i =
613 buf[(q + 1)].imag * w[m][(k + 1)].real +
614 buf[(q + 1)].real * w[m][(k + 1)].imag;
615 buf[(p + 1)].real = tmp_a_r + tmp_b_r;
616 buf[(p + 1)].imag = tmp_a_i + tmp_b_i;
617 buf[(q + 1)].real = tmp_a_r - tmp_b_r;
618 buf[(q + 1)].imag = tmp_a_i - tmp_b_i;
619 #else
620 int p = k + i;
621 int q = p + two_m;
622 vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
623 const vector float vczero = (const vector float)FOUROF(0.);
624 // first compute buf[q] and buf[q+1]
625 vecq = vec_ld(q << 3, (float*)buf);
626 vecw = vec_ld(0, (float*)&(w[m][k]));
627 temp1 = vec_madd(vecq, vecw, vczero);
628 temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
629 temp2 = vec_madd(temp2, vecw, vczero);
630 temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
631 temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
632 vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
633 // then butterfly with buf[p] and buf[p+1]
634 vecp = vec_ld(p << 3, (float*)buf);
636 temp1 = vec_add(vecp, vecq);
637 temp2 = vec_sub(vecp, vecq);
639 vec_st(temp1, p << 3, (float*)buf);
640 vec_st(temp2, q << 3, (float*)buf);
641 #endif
646 /* Post IFFT complex multiply plus IFFT complex conjugate*/
647 for( i=0; i < 128; i+=4) {
648 /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
649 #if 0
650 tmp_a_r = buf[(i + 0)].real;
651 tmp_a_i = -1.0 * buf[(i + 0)].imag;
652 buf[(i + 0)].real =
653 (tmp_a_r * xcos1[(i + 0)]) - (tmp_a_i * xsin1[(i + 0)]);
654 buf[(i + 0)].imag =
655 (tmp_a_r * xsin1[(i + 0)]) + (tmp_a_i * xcos1[(i + 0)]);
657 tmp_a_r = buf[(i + 1)].real;
658 tmp_a_i = -1.0 * buf[(i + 1)].imag;
659 buf[(i + 1)].real =
660 (tmp_a_r * xcos1[(i + 1)]) - (tmp_a_i * xsin1[(i + 1)]);
661 buf[(i + 1)].imag =
662 (tmp_a_r * xsin1[(i + 1)]) + (tmp_a_i * xcos1[(i + 1)]);
664 tmp_a_r = buf[(i + 2)].real;
665 tmp_a_i = -1.0 * buf[(i + 2)].imag;
666 buf[(i + 2)].real =
667 (tmp_a_r * xcos1[(i + 2)]) - (tmp_a_i * xsin1[(i + 2)]);
668 buf[(i + 2)].imag =
669 (tmp_a_r * xsin1[(i + 2)]) + (tmp_a_i * xcos1[(i + 2)]);
671 tmp_a_r = buf[(i + 3)].real;
672 tmp_a_i = -1.0 * buf[(i + 3)].imag;
673 buf[(i + 3)].real =
674 (tmp_a_r * xcos1[(i + 3)]) - (tmp_a_i * xsin1[(i + 3)]);
675 buf[(i + 3)].imag =
676 (tmp_a_r * xsin1[(i + 3)]) + (tmp_a_i * xcos1[(i + 3)]);
677 #else
678 vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
679 vector float temp0022, temp1133, tempCS01;
680 const vector float vczero = (const vector float)FOUROF(0.);
682 bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
683 bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
685 cosv = vec_ld(i << 2, xcos1);
686 sinv = vec_ld(i << 2, xsin1);
688 temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
689 temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
690 tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
691 temp1 = vec_madd(temp0022, tempCS01, vczero);
692 tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
693 temp2 = vec_madd(temp1133, tempCS01, vczero);
694 bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
696 vec_st(bufv_0, (i + 0) << 3, (float*)buf);
698 /* idem with bufv_2 and high-order cosv/sinv */
700 temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
701 temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
702 tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
703 temp1 = vec_madd(temp0022, tempCS01, vczero);
704 tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
705 temp2 = vec_madd(temp1133, tempCS01, vczero);
706 bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
708 vec_st(bufv_2, (i + 2) << 3, (float*)buf);
710 #endif
713 data_ptr = data;
714 delay_ptr = delay;
715 window_ptr = imdct_window;
717 /* Window and convert to real valued signal */
718 for(i=0; i< 64; i++) {
719 *data_ptr++ = -buf[64+i].imag * *window_ptr++ + *delay_ptr++ + bias;
720 *data_ptr++ = buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
723 for(i=0; i< 64; i++) {
724 *data_ptr++ = -buf[i].real * *window_ptr++ + *delay_ptr++ + bias;
725 *data_ptr++ = buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
728 /* The trailing edge of the window goes into the delay line */
729 delay_ptr = delay;
731 for(i=0; i< 64; i++) {
732 *delay_ptr++ = -buf[64+i].real * *--window_ptr;
733 *delay_ptr++ = buf[64-i-1].imag * *--window_ptr;
736 for(i=0; i<64; i++) {
737 *delay_ptr++ = buf[i].imag * *--window_ptr;
738 *delay_ptr++ = -buf[128-i-1].real * *--window_ptr;
741 #endif
744 // Stuff below this line is borrowed from libac3
745 #include "srfftp.h"
746 #ifdef ARCH_X86
747 #ifndef HAVE_3DNOW
748 #define HAVE_3DNOW 1
749 #endif
750 #include "srfftp_3dnow.h"
/* Packed constants for the 3DNow! kernels. 0x80000000UL is presumably the
 * IEEE-754 single-precision sign-bit mask, giving {+,-} / {-,+} sign-flip
 * pairs — confirm against srfftp_3dnow.h usage. */
752 const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
753 const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
/* 1/sqrt(2) in both lanes. */
754 const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
/* imdct_3dnow.h is included twice on purpose: once with HAVE_3DNOWEX
 * undefined (plain 3DNow! build) and once with it defined (3DNow!Ex build),
 * producing two specialized function sets from the same template header. */
756 #undef HAVE_3DNOWEX
757 #include "imdct_3dnow.h"
758 #define HAVE_3DNOWEX
759 #include "imdct_3dnow.h"
761 void
762 imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
764 /* int i,k;
765 int p,q;*/
766 int m;
767 int two_m;
768 int two_m_plus_one;
770 /* sample_t tmp_a_i;
771 sample_t tmp_a_r;
772 sample_t tmp_b_i;
773 sample_t tmp_b_r;*/
775 sample_t *data_ptr;
776 sample_t *delay_ptr;
777 sample_t *window_ptr;
779 /* 512 IMDCT with source and dest data in 'data' */
780 /* see the c version (dct_do_512()), its allmost identical, just in C */
782 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
783 /* Bit reversed shuffling */
784 asm volatile(
785 "xorl %%esi, %%esi \n\t"
786 "leal "MANGLE(bit_reverse_512)", %%eax \n\t"
787 "movl $1008, %%edi \n\t"
788 "pushl %%ebp \n\t" //use ebp without telling gcc
789 ".balign 16 \n\t"
790 "1: \n\t"
791 "movlps (%0, %%esi), %%xmm0 \n\t" // XXXI
792 "movhps 8(%0, %%edi), %%xmm0 \n\t" // RXXI
793 "movlps 8(%0, %%esi), %%xmm1 \n\t" // XXXi
794 "movhps (%0, %%edi), %%xmm1 \n\t" // rXXi
795 "shufps $0x33, %%xmm1, %%xmm0 \n\t" // irIR
796 "movaps "MANGLE(sseSinCos1c)"(%%esi), %%xmm2\n\t"
797 "mulps %%xmm0, %%xmm2 \n\t"
798 "shufps $0xB1, %%xmm0, %%xmm0 \n\t" // riRI
799 "mulps "MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
800 "subps %%xmm0, %%xmm2 \n\t"
801 "movzbl (%%eax), %%edx \n\t"
802 "movzbl 1(%%eax), %%ebp \n\t"
803 "movlps %%xmm2, (%1, %%edx,8) \n\t"
804 "movhps %%xmm2, (%1, %%ebp,8) \n\t"
805 "addl $16, %%esi \n\t"
806 "addl $2, %%eax \n\t" // avoid complex addressing for P4 crap
807 "subl $16, %%edi \n\t"
808 " jnc 1b \n\t"
809 "popl %%ebp \n\t"//no we didnt touch ebp *g*
810 :: "b" (data), "c" (buf)
811 : "%esi", "%edi", "%eax", "%edx"
815 /* FFT Merge */
816 /* unoptimized variant
817 for (m=1; m < 7; m++) {
818 if(m)
819 two_m = (1 << m);
820 else
821 two_m = 1;
823 two_m_plus_one = (1 << (m+1));
825 for(i = 0; i < 128; i += two_m_plus_one) {
826 for(k = 0; k < two_m; k++) {
827 p = k + i;
828 q = p + two_m;
829 tmp_a_r = buf[p].real;
830 tmp_a_i = buf[p].imag;
831 tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
832 tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
833 buf[p].real = tmp_a_r + tmp_b_r;
834 buf[p].imag = tmp_a_i + tmp_b_i;
835 buf[q].real = tmp_a_r - tmp_b_r;
836 buf[q].imag = tmp_a_i - tmp_b_i;
842 /* 1. iteration */
843 // Note w[0][0]={1,0}
844 asm volatile(
845 "xorps %%xmm1, %%xmm1 \n\t"
846 "xorps %%xmm2, %%xmm2 \n\t"
847 "movl %0, %%esi \n\t"
848 ".balign 16 \n\t"
849 "1: \n\t"
850 "movlps (%%esi), %%xmm0 \n\t" //buf[p]
851 "movlps 8(%%esi), %%xmm1\n\t" //buf[q]
852 "movhps (%%esi), %%xmm0 \n\t" //buf[p]
853 "movhps 8(%%esi), %%xmm2\n\t" //buf[q]
854 "addps %%xmm1, %%xmm0 \n\t"
855 "subps %%xmm2, %%xmm0 \n\t"
856 "movaps %%xmm0, (%%esi) \n\t"
857 "addl $16, %%esi \n\t"
858 "cmpl %1, %%esi \n\t"
859 " jb 1b \n\t"
860 :: "g" (buf), "r" (buf + 128)
861 : "%esi"
864 /* 2. iteration */
865 // Note w[1]={{1,0}, {0,-1}}
866 asm volatile(
867 "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
868 "movl %0, %%esi \n\t"
869 ".balign 16 \n\t"
870 "1: \n\t"
871 "movaps 16(%%esi), %%xmm2 \n\t" //r2,i2,r3,i3
872 "shufps $0xB4, %%xmm2, %%xmm2 \n\t" //r2,i2,i3,r3
873 "mulps %%xmm7, %%xmm2 \n\t" //r2,i2,i3,-r3
874 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
875 "movaps (%%esi), %%xmm1 \n\t" //r0,i0,r1,i1
876 "addps %%xmm2, %%xmm0 \n\t"
877 "subps %%xmm2, %%xmm1 \n\t"
878 "movaps %%xmm0, (%%esi) \n\t"
879 "movaps %%xmm1, 16(%%esi) \n\t"
880 "addl $32, %%esi \n\t"
881 "cmpl %1, %%esi \n\t"
882 " jb 1b \n\t"
883 :: "g" (buf), "r" (buf + 128)
884 : "%esi"
887 /* 3. iteration */
889 Note sseW2+0={1,1,sqrt(2),sqrt(2))
890 Note sseW2+16={0,0,sqrt(2),-sqrt(2))
891 Note sseW2+32={0,0,-sqrt(2),-sqrt(2))
892 Note sseW2+48={1,-1,sqrt(2),-sqrt(2))
894 asm volatile(
895 "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
896 "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
897 "xorps %%xmm5, %%xmm5 \n\t"
898 "xorps %%xmm2, %%xmm2 \n\t"
899 "movl %0, %%esi \n\t"
900 ".balign 16 \n\t"
901 "1: \n\t"
902 "movaps 32(%%esi), %%xmm2 \n\t" //r4,i4,r5,i5
903 "movaps 48(%%esi), %%xmm3 \n\t" //r6,i6,r7,i7
904 "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
905 "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
906 "mulps %%xmm2, %%xmm4 \n\t"
907 "mulps %%xmm3, %%xmm5 \n\t"
908 "shufps $0xB1, %%xmm2, %%xmm2 \n\t" //i4,r4,i5,r5
909 "shufps $0xB1, %%xmm3, %%xmm3 \n\t" //i6,r6,i7,r7
910 "mulps %%xmm6, %%xmm3 \n\t"
911 "mulps %%xmm7, %%xmm2 \n\t"
912 "movaps (%%esi), %%xmm0 \n\t" //r0,i0,r1,i1
913 "movaps 16(%%esi), %%xmm1 \n\t" //r2,i2,r3,i3
914 "addps %%xmm4, %%xmm2 \n\t"
915 "addps %%xmm5, %%xmm3 \n\t"
916 "movaps %%xmm2, %%xmm4 \n\t"
917 "movaps %%xmm3, %%xmm5 \n\t"
918 "addps %%xmm0, %%xmm2 \n\t"
919 "addps %%xmm1, %%xmm3 \n\t"
920 "subps %%xmm4, %%xmm0 \n\t"
921 "subps %%xmm5, %%xmm1 \n\t"
922 "movaps %%xmm2, (%%esi) \n\t"
923 "movaps %%xmm3, 16(%%esi) \n\t"
924 "movaps %%xmm0, 32(%%esi) \n\t"
925 "movaps %%xmm1, 48(%%esi) \n\t"
926 "addl $64, %%esi \n\t"
927 "cmpl %1, %%esi \n\t"
928 " jb 1b \n\t"
929 :: "g" (buf), "r" (buf + 128)
930 : "%esi"
933 /* 4-7. iterations */
934 for (m=3; m < 7; m++) {
935 two_m = (1 << m);
936 two_m_plus_one = two_m<<1;
937 asm volatile(
938 "movl %0, %%esi \n\t"
939 ".balign 16 \n\t"
940 "1: \n\t"
941 "xorl %%edi, %%edi \n\t" // k
942 "leal (%%esi, %3), %%edx \n\t"
943 "2: \n\t"
944 "movaps (%%edx, %%edi), %%xmm1 \n\t"
945 "movaps (%4, %%edi, 2), %%xmm2 \n\t"
946 "mulps %%xmm1, %%xmm2 \n\t"
947 "shufps $0xB1, %%xmm1, %%xmm1 \n\t"
948 "mulps 16(%4, %%edi, 2), %%xmm1 \n\t"
949 "movaps (%%esi, %%edi), %%xmm0 \n\t"
950 "addps %%xmm2, %%xmm1 \n\t"
951 "movaps %%xmm1, %%xmm2 \n\t"
952 "addps %%xmm0, %%xmm1 \n\t"
953 "subps %%xmm2, %%xmm0 \n\t"
954 "movaps %%xmm1, (%%esi, %%edi) \n\t"
955 "movaps %%xmm0, (%%edx, %%edi) \n\t"
956 "addl $16, %%edi \n\t"
957 "cmpl %3, %%edi \n\t" //FIXME (opt) count against 0
958 " jb 2b \n\t"
959 "addl %2, %%esi \n\t"
960 "cmpl %1, %%esi \n\t"
961 " jb 1b \n\t"
962 :: "g" (buf), "m" (buf+128), "m" (two_m_plus_one<<3), "r" (two_m<<3),
963 "r" (sseW[m])
964 : "%esi", "%edi", "%edx"
968 /* Post IFFT complex multiply plus IFFT complex conjugate*/
969 asm volatile(
970 "movl $-1024, %%esi \n\t"
971 ".balign 16 \n\t"
972 "1: \n\t"
973 "movaps (%0, %%esi), %%xmm0 \n\t"
974 "movaps (%0, %%esi), %%xmm1 \n\t"
975 "shufps $0xB1, %%xmm0, %%xmm0 \n\t"
976 "mulps 1024+"MANGLE(sseSinCos1c)"(%%esi), %%xmm1\n\t"
977 "mulps 1024+"MANGLE(sseSinCos1d)"(%%esi), %%xmm0\n\t"
978 "addps %%xmm1, %%xmm0 \n\t"
979 "movaps %%xmm0, (%0, %%esi) \n\t"
980 "addl $16, %%esi \n\t"
981 " jnz 1b \n\t"
982 :: "r" (buf+128)
983 : "%esi"
987 data_ptr = data;
988 delay_ptr = delay;
989 window_ptr = imdct_window;
991 /* Window and convert to real valued signal */
992 asm volatile(
993 "xorl %%edi, %%edi \n\t" // 0
994 "xorl %%esi, %%esi \n\t" // 0
995 "movss %3, %%xmm2 \n\t" // bias
996 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
997 ".balign 16 \n\t"
998 "1: \n\t"
999 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
1000 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
1001 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
1002 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
1003 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1004 "mulps "MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1005 "addps (%2, %%esi), %%xmm0 \n\t"
1006 "addps %%xmm2, %%xmm0 \n\t"
1007 "movaps %%xmm0, (%1, %%esi) \n\t"
1008 "addl $16, %%esi \n\t"
1009 "subl $16, %%edi \n\t"
1010 "cmpl $512, %%esi \n\t"
1011 " jb 1b \n\t"
1012 :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1013 : "%esi", "%edi"
1015 data_ptr+=128;
1016 delay_ptr+=128;
1017 // window_ptr+=128;
1019 asm volatile(
1020 "movl $1024, %%edi \n\t" // 512
1021 "xorl %%esi, %%esi \n\t" // 0
1022 "movss %3, %%xmm2 \n\t" // bias
1023 "shufps $0x00, %%xmm2, %%xmm2 \n\t" // bias, bias, ...
1024 ".balign 16 \n\t"
1025 "1: \n\t"
1026 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
1027 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
1028 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
1029 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
1030 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1031 "mulps 512+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1032 "addps (%2, %%esi), %%xmm0 \n\t"
1033 "addps %%xmm2, %%xmm0 \n\t"
1034 "movaps %%xmm0, (%1, %%esi) \n\t"
1035 "addl $16, %%esi \n\t"
1036 "subl $16, %%edi \n\t"
1037 "cmpl $512, %%esi \n\t"
1038 " jb 1b \n\t"
1039 :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
1040 : "%esi", "%edi"
1042 data_ptr+=128;
1043 // window_ptr+=128;
1045 /* The trailing edge of the window goes into the delay line */
1046 delay_ptr = delay;
1048 asm volatile(
1049 "xorl %%edi, %%edi \n\t" // 0
1050 "xorl %%esi, %%esi \n\t" // 0
1051 ".balign 16 \n\t"
1052 "1: \n\t"
1053 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? ? A
1054 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? ? C
1055 "movhps -16(%0, %%edi), %%xmm1 \n\t" // D ? ? C
1056 "movhps -8(%0, %%edi), %%xmm0 \n\t" // B ? ? A
1057 "shufps $0xCC, %%xmm1, %%xmm0 \n\t" // D C B A
1058 "mulps 1024+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1059 "movaps %%xmm0, (%1, %%esi) \n\t"
1060 "addl $16, %%esi \n\t"
1061 "subl $16, %%edi \n\t"
1062 "cmpl $512, %%esi \n\t"
1063 " jb 1b \n\t"
1064 :: "r" (buf+64), "r" (delay_ptr)
1065 : "%esi", "%edi"
1067 delay_ptr+=128;
1068 // window_ptr-=128;
1070 asm volatile(
1071 "movl $1024, %%edi \n\t" // 1024
1072 "xorl %%esi, %%esi \n\t" // 0
1073 ".balign 16 \n\t"
1074 "1: \n\t"
1075 "movlps (%0, %%esi), %%xmm0 \n\t" // ? ? A ?
1076 "movlps 8(%0, %%esi), %%xmm1 \n\t" // ? ? C ?
1077 "movhps -16(%0, %%edi), %%xmm1 \n\t" // ? D C ?
1078 "movhps -8(%0, %%edi), %%xmm0 \n\t" // ? B A ?
1079 "shufps $0x99, %%xmm1, %%xmm0 \n\t" // D C B A
1080 "mulps 1536+"MANGLE(sseWindow)"(%%esi), %%xmm0\n\t"
1081 "movaps %%xmm0, (%1, %%esi) \n\t"
1082 "addl $16, %%esi \n\t"
1083 "subl $16, %%edi \n\t"
1084 "cmpl $512, %%esi \n\t"
1085 " jb 1b \n\t"
1086 :: "r" (buf), "r" (delay_ptr)
1087 : "%esi", "%edi"
1090 #endif //arch_x86
1092 void
1093 imdct_do_256(sample_t data[],sample_t delay[],sample_t bias)
1095 int i,k;
1096 int p,q;
1097 int m;
1098 int two_m;
1099 int two_m_plus_one;
1101 sample_t tmp_a_i;
1102 sample_t tmp_a_r;
1103 sample_t tmp_b_i;
1104 sample_t tmp_b_r;
1106 sample_t *data_ptr;
1107 sample_t *delay_ptr;
1108 sample_t *window_ptr;
1110 complex_t *buf_1, *buf_2;
1112 buf_1 = &buf[0];
1113 buf_2 = &buf[64];
1115 /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1116 for(k=0; k<64; k++) {
1117 /* X1[k] = X[2*k] */
1118 /* X2[k] = X[2*k+1] */
1120 p = 2 * (128-2*k-1);
1121 q = 2 * (2 * k);
1123 /* Z1[k] = (X1[128-2*k-1] + j * X1[2*k]) * (xcos2[k] + j * xsin2[k]); */
1124 buf_1[k].real = data[p] * xcos2[k] - data[q] * xsin2[k];
1125 buf_1[k].imag = -1.0f * (data[q] * xcos2[k] + data[p] * xsin2[k]);
1126 /* Z2[k] = (X2[128-2*k-1] + j * X2[2*k]) * (xcos2[k] + j * xsin2[k]); */
1127 buf_2[k].real = data[p + 1] * xcos2[k] - data[q + 1] * xsin2[k];
1128 buf_2[k].imag = -1.0f * ( data[q + 1] * xcos2[k] + data[p + 1] * xsin2[k]);
1131 /* IFFT Bit reversed shuffling */
1132 for(i=0; i<64; i++) {
1133 k = bit_reverse_256[i];
1134 if (k < i) {
1135 swap_cmplx(&buf_1[i],&buf_1[k]);
1136 swap_cmplx(&buf_2[i],&buf_2[k]);
1140 /* FFT Merge */
1141 for (m=0; m < 6; m++) {
1142 two_m = (1 << m);
1143 two_m_plus_one = (1 << (m+1));
1145 /* FIXME */
1146 if(m)
1147 two_m = (1 << m);
1148 else
1149 two_m = 1;
1151 for(k = 0; k < two_m; k++) {
1152 for(i = 0; i < 64; i += two_m_plus_one) {
1153 p = k + i;
1154 q = p + two_m;
1155 /* Do block 1 */
1156 tmp_a_r = buf_1[p].real;
1157 tmp_a_i = buf_1[p].imag;
1158 tmp_b_r = buf_1[q].real * w[m][k].real - buf_1[q].imag * w[m][k].imag;
1159 tmp_b_i = buf_1[q].imag * w[m][k].real + buf_1[q].real * w[m][k].imag;
1160 buf_1[p].real = tmp_a_r + tmp_b_r;
1161 buf_1[p].imag = tmp_a_i + tmp_b_i;
1162 buf_1[q].real = tmp_a_r - tmp_b_r;
1163 buf_1[q].imag = tmp_a_i - tmp_b_i;
1165 /* Do block 2 */
1166 tmp_a_r = buf_2[p].real;
1167 tmp_a_i = buf_2[p].imag;
1168 tmp_b_r = buf_2[q].real * w[m][k].real - buf_2[q].imag * w[m][k].imag;
1169 tmp_b_i = buf_2[q].imag * w[m][k].real + buf_2[q].real * w[m][k].imag;
1170 buf_2[p].real = tmp_a_r + tmp_b_r;
1171 buf_2[p].imag = tmp_a_i + tmp_b_i;
1172 buf_2[q].real = tmp_a_r - tmp_b_r;
1173 buf_2[q].imag = tmp_a_i - tmp_b_i;
1178 /* Post IFFT complex multiply */
1179 for( i=0; i < 64; i++) {
1180 /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
1181 tmp_a_r = buf_1[i].real;
1182 tmp_a_i = -buf_1[i].imag;
1183 buf_1[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1184 buf_1[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1185 /* y2[n] = z2[n] * (xcos2[n] + j * xsin2[n]) ; */
1186 tmp_a_r = buf_2[i].real;
1187 tmp_a_i = -buf_2[i].imag;
1188 buf_2[i].real =(tmp_a_r * xcos2[i]) - (tmp_a_i * xsin2[i]);
1189 buf_2[i].imag =(tmp_a_r * xsin2[i]) + (tmp_a_i * xcos2[i]);
1192 data_ptr = data;
1193 delay_ptr = delay;
1194 window_ptr = imdct_window;
1196 /* Window and convert to real valued signal */
1197 for(i=0; i< 64; i++) {
1198 *data_ptr++ = -buf_1[i].imag * *window_ptr++ + *delay_ptr++ + bias;
1199 *data_ptr++ = buf_1[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
1202 for(i=0; i< 64; i++) {
1203 *data_ptr++ = -buf_1[i].real * *window_ptr++ + *delay_ptr++ + bias;
1204 *data_ptr++ = buf_1[64-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
1207 delay_ptr = delay;
1209 for(i=0; i< 64; i++) {
1210 *delay_ptr++ = -buf_2[i].real * *--window_ptr;
1211 *delay_ptr++ = buf_2[64-i-1].imag * *--window_ptr;
1214 for(i=0; i< 64; i++) {
1215 *delay_ptr++ = buf_2[i].imag * *--window_ptr;
1216 *delay_ptr++ = -buf_2[64-i-1].real * *--window_ptr;
/*
 * Build all lookup tables used by the IMDCT and select the fastest
 * imdct_512 / imdct_256 implementation available on this machine.
 * mm_accel is a bitmask of MM_ACCEL_* capability flags.
 */
1220 void imdct_init (uint32_t mm_accel)
/* If mlib support was compiled in and the flag is set, it wins outright. */
1222 #ifdef LIBA52_MLIB
1223 if (mm_accel & MM_ACCEL_MLIB) {
1224 fprintf (stderr, "Using mlib for IMDCT transform\n");
1225 imdct_512 = imdct_do_512_mlib;
1226 imdct_256 = imdct_do_256_mlib;
1227 } else
1228 #endif
1230 int i, j, k;
1232 /* Twiddle factors to turn IFFT into IMDCT */
1233 for (i = 0; i < 128; i++) {
1234 xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
1235 xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
/* SSE path keeps the 512-point twiddles pre-interleaved: two copies per
 * entry, with the sign pattern the SSE post-multiply kernel expects. */
1237 #ifdef ARCH_X86
1238 for (i = 0; i < 128; i++) {
1239 sseSinCos1c[2*i+0]= xcos1[i];
1240 sseSinCos1c[2*i+1]= -xcos1[i];
1241 sseSinCos1d[2*i+0]= xsin1[i];
1242 sseSinCos1d[2*i+1]= xsin1[i];
1244 #endif
1246 /* More twiddle factors to turn IFFT into IMDCT */
1247 for (i = 0; i < 64; i++) {
1248 xcos2[i] = -cos ((M_PI / 1024) * (8 * i + 1));
1249 xsin2[i] = -sin ((M_PI / 1024) * (8 * i + 1));
/* w[i][k] = e^{-j*pi*k / 2^i}: unit roots for the scalar FFT-merge passes. */
1252 for (i = 0; i < 7; i++) {
1253 j = 1 << i;
1254 for (k = 0; k < j; k++) {
1255 w[i][k].real = cos (-M_PI * k / j);
1256 w[i][k].imag = sin (-M_PI * k / j);
/* sseW: w[] repacked for the SSE butterfly — two complex roots per
 * 16-byte group, real parts duplicated, imaginary parts carrying the
 * +/- sign pattern the mulps/shufps sequence multiplies against. */
1259 #ifdef ARCH_X86
1260 for (i = 1; i < 7; i++) {
1261 j = 1 << i;
1262 for (k = 0; k < j; k+=2) {
1264 sseW[i][4*k + 0] = w[i][k+0].real;
1265 sseW[i][4*k + 1] = w[i][k+0].real;
1266 sseW[i][4*k + 2] = w[i][k+1].real;
1267 sseW[i][4*k + 3] = w[i][k+1].real;
1269 sseW[i][4*k + 4] = -w[i][k+0].imag;
1270 sseW[i][4*k + 5] = w[i][k+0].imag;
1271 sseW[i][4*k + 6] = -w[i][k+1].imag;
1272 sseW[i][4*k + 7] = w[i][k+1].imag;
1274 //we multiply more or less uninitalized numbers so we need to use exactly 0.0
1275 if(k==0)
1277 // sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
1278 sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
/* Force exact zeros at the half-way root as well (see note above). */
1281 if(2*k == j)
1283 sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
1284 // sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
/* sseWindow: the window coefficients with the sign flips and index
 * reversals folded in, so the SSE windowing loops need only mulps. */
1289 for(i=0; i<128; i++)
1291 sseWindow[2*i+0]= -imdct_window[2*i+0];
1292 sseWindow[2*i+1]= imdct_window[2*i+1];
1295 for(i=0; i<64; i++)
1297 sseWindow[256 + 2*i+0]= -imdct_window[254 - 2*i+1];
1298 sseWindow[256 + 2*i+1]= imdct_window[254 - 2*i+0];
1299 sseWindow[384 + 2*i+0]= imdct_window[126 - 2*i+1];
1300 sseWindow[384 + 2*i+1]= -imdct_window[126 - 2*i+0];
1302 #endif // arch_x86
/* Runtime dispatch: start with the portable C version, then upgrade to
 * the best accelerated implementation the capability mask allows. */
1304 imdct_512 = imdct_do_512;
1305 #ifdef ARCH_X86
1306 if(mm_accel & MM_ACCEL_X86_SSE)
1308 fprintf (stderr, "Using SSE optimized IMDCT transform\n");
1309 imdct_512 = imdct_do_512_sse;
1311 else
1312 if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
1314 fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
1315 imdct_512 = imdct_do_512_3dnowex;
1317 else
1318 if(mm_accel & MM_ACCEL_X86_3DNOW)
1320 fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
1321 imdct_512 = imdct_do_512_3dnow;
1323 else
1324 #endif // arch_x86
1325 #ifdef HAVE_ALTIVEC
1326 if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
1328 fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
1329 imdct_512 = imdct_do_512_altivec;
1331 else
1332 #endif
1333 fprintf (stderr, "No accelerated IMDCT transform found\n");
/* No accelerated 256-point variant here: always the portable C version. */
1334 imdct_256 = imdct_do_256;
/*
 * Recombination step of the split-radix FFT: merges the sub-transform
 * at x[] with the two smaller results at wTB[]/wB[], applying the
 * twiddle tables d[] and d_3[] (presumably w^k and w^{3k} — defined
 * with the TRANS* macros elsewhere in this file; verify there).
 * The register temporaries below are consumed inside those macros.
 * NOTE(review): the loop structure assumes k >= 2 and even — two
 * butterflies are emitted per iteration; confirm against the callers
 * in fft_128p (k = 4, 8, 16).
 */
1338 static void fft_asmb(int k, complex_t *x, complex_t *wTB,
1339 const complex_t *d, const complex_t *d_3)
1341 register complex_t *x2k, *x3k, *x4k, *wB;
1342 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
/* The four quarter sections of the output, spaced 2*k complex values apart. */
1344 x2k = x + 2 * k;
1345 x3k = x2k + 2 * k;
1346 x4k = x3k + 2 * k;
1347 wB = wTB + 2 * k;
/* Index 0 needs no twiddle multiply; index 1 uses d[1]/d_3[1]. */
1349 TRANSZERO(x[0],x2k[0],x3k[0],x4k[0]);
1350 TRANS(x[1],x2k[1],x3k[1],x4k[1],wTB[1],wB[1],d[1],d_3[1]);
/* Remaining indices: two butterflies per trip, pointers advanced by 2. */
1352 --k;
1353 for(;;) {
1354 TRANS(x[2],x2k[2],x3k[2],x4k[2],wTB[2],wB[2],d[2],d_3[2]);
1355 TRANS(x[3],x2k[3],x3k[3],x4k[3],wTB[3],wB[3],d[3],d_3[3]);
1356 if (!--k) break;
1357 x += 2;
1358 x2k += 2;
1359 x3k += 2;
1360 x4k += 2;
1361 d += 2;
1362 d_3 += 2;
1363 wTB += 2;
1364 wB += 2;
/*
 * Fixed-size recombination for a 16-point transform: x[0..15] is the
 * output span, wTB[] the smaller sub-results; delta16/delta16_3 are
 * the hard-coded twiddle tables for this size.  The butterfly math
 * lives in the TRANS* macros (defined elsewhere in this file); the
 * register temporaries below are presumably used inside those macros.
 * NOTE(review): the local k is never referenced in this body — if the
 * macros do not expand to a use of it, it is dead; confirm before
 * removing.
 */
1369 static void fft_asmb16(complex_t *x, complex_t *wTB)
1371 register float a_r, a_i, a1_r, a1_i, u_r, u_i, v_r, v_i;
1372 int k = 2;
1374 /* transform x[0], x[8], x[4], x[12] */
1375 TRANSZERO(x[0],x[4],x[8],x[12]);
1377 /* transform x[1], x[9], x[5], x[13] */
1378 TRANS(x[1],x[5],x[9],x[13],wTB[1],wTB[5],delta16[1],delta16_3[1]);
1380 /* transform x[2], x[10], x[6], x[14] */
1381 TRANSHALF_16(x[2],x[6],x[10],x[14]);
1383 /* transform x[3], x[11], x[7], x[15] */
1384 TRANS(x[3],x[7],x[11],x[15],wTB[3],wTB[7],delta16[3],delta16_3[3]);
1388 static void fft_4(complex_t *x)
1390 /* delta_p = 1 here */
1391 /* x[k] = sum_{i=0..3} x[i] * w^{i*k}, w=e^{-2*pi/4}
1394 register float yt_r, yt_i, yb_r, yb_i, u_r, u_i, vi_r, vi_i;
1396 yt_r = x[0].real;
1397 yb_r = yt_r - x[2].real;
1398 yt_r += x[2].real;
1400 u_r = x[1].real;
1401 vi_i = x[3].real - u_r;
1402 u_r += x[3].real;
1404 u_i = x[1].imag;
1405 vi_r = u_i - x[3].imag;
1406 u_i += x[3].imag;
1408 yt_i = yt_r;
1409 yt_i += u_r;
1410 x[0].real = yt_i;
1411 yt_r -= u_r;
1412 x[2].real = yt_r;
1413 yt_i = yb_r;
1414 yt_i += vi_r;
1415 x[1].real = yt_i;
1416 yb_r -= vi_r;
1417 x[3].real = yb_r;
1419 yt_i = x[0].imag;
1420 yb_i = yt_i - x[2].imag;
1421 yt_i += x[2].imag;
1423 yt_r = yt_i;
1424 yt_r += u_i;
1425 x[0].imag = yt_r;
1426 yt_i -= u_i;
1427 x[2].imag = yt_i;
1428 yt_r = yb_i;
1429 yt_r += vi_i;
1430 x[1].imag = yt_r;
1431 yb_i -= vi_i;
1432 x[3].imag = yb_i;
/*
 * In-place 8-point FFT kernel.  Saves x[1]/x[3], compacts the
 * even-indexed inputs into x[0..3] and runs fft_4 on them, then
 * rebuilds the odd-frequency outputs by hand.  HSQRT2 (defined
 * elsewhere; presumably sqrt(2)/2) supplies the odd-twiddle factor.
 * Heavily sequential register juggling — statement order matters and
 * the temporaries are reused for unrelated values between sections.
 */
1436 static void fft_8(complex_t *x)
1438 /* delta_p = diag{1, sqrt(i)} here */
1439 /* x[k] = sum_{i=0..7} x[i] * w^{i*k}, w=e^{-2*pi/8}
1441 register float wT1_r, wT1_i, wB1_r, wB1_i, wT2_r, wT2_i, wB2_r, wB2_i;
/* Save original x[1] and x[3] before the even elements overwrite them. */
1443 wT1_r = x[1].real;
1444 wT1_i = x[1].imag;
1445 wB1_r = x[3].real;
1446 wB1_i = x[3].imag;
/* Pack even inputs x[0],x[2],x[4],x[6] into x[0..3] for the 4-point FFT. */
1448 x[1] = x[2];
1449 x[2] = x[4];
1450 x[3] = x[6];
1451 fft_4(&x[0]);
1454 /* x[0] x[4] */
1455 wT2_r = x[5].real;
1456 wT2_r += x[7].real;
1457 wT2_r += wT1_r;
1458 wT2_r += wB1_r;
1459 wT2_i = wT2_r;
1460 wT2_r += x[0].real;
1461 wT2_i = x[0].real - wT2_i;
1462 x[0].real = wT2_r;
1463 x[4].real = wT2_i;
1465 wT2_i = x[5].imag;
1466 wT2_i += x[7].imag;
1467 wT2_i += wT1_i;
1468 wT2_i += wB1_i;
1469 wT2_r = wT2_i;
1470 wT2_r += x[0].imag;
1471 wT2_i = x[0].imag - wT2_i;
1472 x[0].imag = wT2_r;
1473 x[4].imag = wT2_i;
1475 /* x[2] x[6] */
1476 wT2_r = x[5].imag;
1477 wT2_r -= x[7].imag;
1478 wT2_r += wT1_i;
1479 wT2_r -= wB1_i;
1480 wT2_i = wT2_r;
1481 wT2_r += x[2].real;
1482 wT2_i = x[2].real - wT2_i;
1483 x[2].real = wT2_r;
1484 x[6].real = wT2_i;
1486 wT2_i = x[5].real;
1487 wT2_i -= x[7].real;
1488 wT2_i += wT1_r;
1489 wT2_i -= wB1_r;
1490 wT2_r = wT2_i;
1491 wT2_r += x[2].imag;
1492 wT2_i = x[2].imag - wT2_i;
1493 x[2].imag = wT2_i;
1494 x[6].imag = wT2_r;
1497 /* x[1] x[5] */
1498 wT2_r = wT1_r;
1499 wT2_r += wB1_i;
1500 wT2_r -= x[5].real;
1501 wT2_r -= x[7].imag;
1502 wT2_i = wT1_i;
1503 wT2_i -= wB1_r;
1504 wT2_i -= x[5].imag;
1505 wT2_i += x[7].real;
/* Rotate by 45 degrees: scale the sum/difference pair by HSQRT2. */
1507 wB2_r = wT2_r;
1508 wB2_r += wT2_i;
1509 wT2_i -= wT2_r;
1510 wB2_r *= HSQRT2;
1511 wT2_i *= HSQRT2;
1512 wT2_r = wB2_r;
1513 wB2_r += x[1].real;
1514 wT2_r = x[1].real - wT2_r;
/* Stash x[5].real (and below x[5].imag) before overwriting x[5]; the
 * saved values feed the x[3]/x[7] section as wB2_i / wB2_r. */
1516 wB2_i = x[5].real;
1517 x[1].real = wB2_r;
1518 x[5].real = wT2_r;
1520 wT2_r = wT2_i;
1521 wT2_r += x[1].imag;
1522 wT2_i = x[1].imag - wT2_i;
1523 wB2_r = x[5].imag;
1524 x[1].imag = wT2_r;
1525 x[5].imag = wT2_i;
1527 /* x[3] x[7] */
1528 wT1_r -= wB1_i;
1529 wT1_i += wB1_r;
1530 wB1_r = wB2_i - x[7].imag;
1531 wB1_i = wB2_r + x[7].real;
1532 wT1_r -= wB1_r;
1533 wT1_i -= wB1_i;
/* Same 45-degree rotation for the remaining odd pair. */
1534 wB1_r = wT1_r + wT1_i;
1535 wB1_r *= HSQRT2;
1536 wT1_i -= wT1_r;
1537 wT1_i *= HSQRT2;
1538 wB2_r = x[3].real;
1539 wB2_i = wB2_r + wT1_i;
1540 wB2_r -= wT1_i;
1541 x[3].real = wB2_i;
1542 x[7].real = wB2_r;
1543 wB2_i = x[3].imag;
1544 wB2_r = wB2_i + wB1_r;
1545 wB2_i -= wB1_r;
1546 x[3].imag = wB2_i;
1547 x[7].imag = wB2_r;
1551 static void fft_128p(complex_t *a)
1553 fft_8(&a[0]); fft_4(&a[8]); fft_4(&a[12]);
1554 fft_asmb16(&a[0], &a[8]);
1556 fft_8(&a[16]), fft_8(&a[24]);
1557 fft_asmb(4, &a[0], &a[16],&delta32[0], &delta32_3[0]);
1559 fft_8(&a[32]); fft_4(&a[40]); fft_4(&a[44]);
1560 fft_asmb16(&a[32], &a[40]);
1562 fft_8(&a[48]); fft_4(&a[56]); fft_4(&a[60]);
1563 fft_asmb16(&a[48], &a[56]);
1565 fft_asmb(8, &a[0], &a[32],&delta64[0], &delta64_3[0]);
1567 fft_8(&a[64]); fft_4(&a[72]); fft_4(&a[76]);
1568 /* fft_16(&a[64]); */
1569 fft_asmb16(&a[64], &a[72]);
1571 fft_8(&a[80]); fft_8(&a[88]);
1573 /* fft_32(&a[64]); */
1574 fft_asmb(4, &a[64], &a[80],&delta32[0], &delta32_3[0]);
1576 fft_8(&a[96]); fft_4(&a[104]), fft_4(&a[108]);
1577 /* fft_16(&a[96]); */
1578 fft_asmb16(&a[96], &a[104]);
1580 fft_8(&a[112]), fft_8(&a[120]);
1581 /* fft_32(&a[96]); */
1582 fft_asmb(4, &a[96], &a[112], &delta32[0], &delta32_3[0]);
1584 /* fft_128(&a[0]); */
1585 fft_asmb(16, &a[0], &a[64], &delta128[0], &delta128_3[0]);