liba52/liba52_changes.diff

   1 --- include/a52.h       2006-06-12 15:04:57.000000000 +0200
   2 +++ liba52/a52.h        2006-06-05 02:23:02.000000000 +0200
   3 @@ -59,4 +66,9 @@
   4  int a52_block (a52_state_t * state);
   5  void a52_free (a52_state_t * state);
   6
   7 +void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
   8 +extern int (* a52_resample) (float * _f, int16_t * s16);
   9 +
  10 +uint16_t crc16_block(uint8_t *data,uint32_t num_bytes);
  11 +
  12  #endif /* A52_H */
  13 --- liba52/a52_internal.h       2006-06-12 15:05:07.000000000 +0200
  14 +++ liba52/a52_internal.h       2006-06-05 02:23:02.000000000 +0200
  15 @@ -103,18 +107,34 @@
  16  #define DELTA_BIT_NONE (2)
  17  #define DELTA_BIT_RESERVED (3)
  18
  19 +#if ARCH_X86_64
  20 +# define REG_a "rax"
  21 +# define REG_d "rdx"
  22 +# define REG_S "rsi"
  23 +# define REG_D "rdi"
  24 +# define REG_BP "rbp"
  25 +#else
  26 +# define REG_a "eax"
  27 +# define REG_d "edx"
  28 +# define REG_S "esi"
  29 +# define REG_D "edi"
  30 +# define REG_BP "ebp"
  31 +#endif
  32 +
  33  void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart,
  34                        int start, int end, int fastleak, int slowleak,
  35                        expbap_t * expbap);
  36
  37  int a52_downmix_init (int input, int flags, sample_t * level,
  38                       sample_t clev, sample_t slev);
  39 +void downmix_accel_init(uint32_t mm_accel);
  40  int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
  41                        sample_t clev, sample_t slev);
  42 -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
  43 +extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias,
  44                   sample_t clev, sample_t slev);
  45 -void a52_upmix (sample_t * samples, int acmod, int output);
  46 +extern void (*a52_upmix) (sample_t * samples, int acmod, int output);
  47
  48  void a52_imdct_init (uint32_t mm_accel);
  49  void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias);
  50 -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias);
  51 +extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
  52 +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias);
  53 --- liba52/bitstream.c  2006-06-12 15:05:07.000000000 +0200
  54 +++ liba52/bitstream.c  2006-06-05 02:23:02.000000000 +0200
  55 @@ -31,6 +35,10 @@
  56
  57  #define BUFFER_SIZE 4096
  58
  59 +#ifdef ALT_BITSTREAM_READER
  60 +int indx=0;
  61 +#endif
  62 +
  63  void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf)
  64  {
  65      int align;
  66 @@ -38,6 +46,9 @@
  67      align = (long)buf & 3;
  68      state->buffer_start = (uint32_t *) (buf - align);
  69      state->bits_left = 0;
  70 +#ifdef ALT_BITSTREAM_READER
  71 +    indx=0;
  72 +#endif
  73      bitstream_get (state, align * 8);
  74  }
  75
  76 --- liba52/bitstream.h  2006-06-12 15:05:07.000000000 +0200
  77 +++ liba52/bitstream.h  2006-06-05 02:23:02.000000000 +0200
  78 @@ -21,6 +25,42 @@
  79   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  80   */
  81
  82 +/* code from ffmpeg/libavcodec */
  83 +#if defined(__sparc__) || defined(hpux)
  84 +/*
  85 + * the alt bitstream reader performs unaligned memory accesses; that doesn't work
  86 + * on sparc/hpux.  For now, disable ALT_BITSTREAM_READER.
  87 + */
  88 +#undef ALT_BITSTREAM_READER
  89 +#else
  90 +// alternative (faster) bitstream reader (reads up to 3 bytes over the end of the input)
  91 +#define ALT_BITSTREAM_READER
  92 +
  93 +/* used to avoid misaligned exceptions on some archs (alpha, ...) */
  94 +#if ARCH_X86 || HAVE_ARMV6
  95 +#    define unaligned32(a) (*(uint32_t*)(a))
  96 +#else
  97 +#    ifdef __GNUC__
  98 +static inline uint32_t unaligned32(const void *v) { /* GCC variant: load a 32-bit value from a possibly misaligned address */
  99 +    struct Unaligned {
 100 +       uint32_t i;
 101 +    } __attribute__((packed)); /* packed drops the struct's alignment requirement so the member read below need not be aligned */
 102 +
 103 +    return ((const struct Unaligned *) v)->i; /* value is returned in host byte order; callers byte-swap separately */
 104 +}
 105 +#    elif defined(__DECC)
 106 +static inline uint32_t unaligned32(const void *v) { /* DEC C variant: load a 32-bit value from a possibly misaligned address */
 107 +    return *(const __unaligned uint32_t *) v; /* uses the compiler's __unaligned pointer qualifier for the access */
 108 +}
 109 +#    else
 110 +static inline uint32_t unaligned32(const void *v) { /* fallback variant for compilers that are neither GCC nor DEC C */
 111 +    return *(const uint32_t *) v; /* plain dereference: NOTE(review) offers no protection if v is misaligned on a strict-alignment target */
 112 +}
 113 +#    endif
 114 +#endif //!ARCH_X86
 115 +
 116 +#endif
 117 +
 118  /* (stolen from the kernel) */
 119  #ifdef WORDS_BIGENDIAN
 120
 121 @@ -28,7 +68,7 @@
 122
 123  #else
 124
 125 -#      if 0 && defined (__i386__)
 126 +#      if defined (__i386__)
 127
 128  #      define swab32(x) __i386_swab32(x)
 129         static inline const uint32_t __i386_swab32(uint32_t x)
 130 @@ -39,19 +79,34 @@
 131
 132  #      else
 133
 134 -#      define swab32(x)\
 135 -((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) |  \
 136 - (((uint8_t*)&x)[2] << 8)  | (((uint8_t*)&x)[3]))
 137 -
 138 +#      define swab32(x) __generic_swab32(x)
 139 +       static inline const uint32_t __generic_swab32(uint32_t x) /* generic swab32(): taking x by value lets callers pass expressions, which the old &x-based macro could not */
 140 +       { /* builds the result from the bytes of x in memory order, first byte most significant */
 141 +               return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) | /* NOTE(review): the uint8_t operand is promoted to int before << 24 - confirm behaviour for bytes >= 0x80 */
 142 +                (((uint8_t*)&x)[2] << 8)  | (((uint8_t*)&x)[3]));
 143 +       }
 144  #      endif
 145  #endif
 146
 147 +#ifdef ALT_BITSTREAM_READER
 148 +extern int indx;
 149 +#endif
 150 +
 151  void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf);
 152  uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits);
 153  int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits);
 154
 155  static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits)
 156  {
 157 +#ifdef ALT_BITSTREAM_READER
 158 +    uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
 159 +
 160 +    result<<= (indx&0x07);
 161 +    result>>= 32 - num_bits;
 162 +    indx+= num_bits;
 163 +
 164 +    return result;
 165 +#else
 166      uint32_t result;
 167
 168      if (num_bits < state->bits_left) {
 169 @@ -61,10 +116,29 @@
 170      }
 171
 172      return a52_bitstream_get_bh (state, num_bits);
 173 +#endif
 174 +}
 175 +
 176 +static inline void bitstream_skip(a52_state_t * state, int num_bits) /* discard num_bits bits from the stream without returning them */
 177 +{
 178 +#ifdef ALT_BITSTREAM_READER
 179 +       indx+= num_bits; /* ALT reader: just advance the global bit index; state is not touched and no memory is read */
 180 +#else
 181 +       bitstream_get(state, num_bits); /* default reader: read the bits and drop the result */
 182 +#endif
 183  }
 184
 185  static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits)
 186  {
 187 +#ifdef ALT_BITSTREAM_READER
 188 +    int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
 189 +
 190 +    result<<= (indx&0x07);
 191 +    result>>= 32 - num_bits;
 192 +    indx+= num_bits;
 193 +
 194 +    return result;
 195 +#else
 196      int32_t result;
 197
 198      if (num_bits < state->bits_left) {
 199 @@ -74,4 +148,5 @@
 200      }
 201
 202      return a52_bitstream_get_bh_2 (state, num_bits);
 203 +#endif
 204  }
 205 --- liba52/downmix.c    2006-06-12 15:17:53.000000000 +0200
 206 +++ liba52/downmix.c    2006-06-05 02:23:02.000000000 +0200
 207 @@ -19,18 +23,46 @@
 208   * You should have received a copy of the GNU General Public License
 209   * along with this program; if not, write to the Free Software
 210   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 211 + *
 212 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
 213   */
 214
 215  #include "config.h"
 216
 217  #include <string.h>
 218  #include <inttypes.h>
 219
 220  #include "a52.h"
 221  #include "a52_internal.h"
 222 +#include "mm_accel.h"
 223
 224  #define CONVERT(acmod,output) (((output) << 3) + (acmod))
 225
 226 +
 227 +void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias,
 228 +             sample_t clev, sample_t slev)= NULL;
 229 +void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL;
 230 +
 231 +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
 232 +             sample_t clev, sample_t slev);
 233 +static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
 234 +             sample_t clev, sample_t slev);
 235 +static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
 236 +             sample_t clev, sample_t slev);
 237 +static void upmix_MMX (sample_t * samples, int acmod, int output);
 238 +static void upmix_C (sample_t * samples, int acmod, int output);
 239 +
 240 +void downmix_accel_init(uint32_t mm_accel) /* select the a52_downmix / a52_upmix implementations from the MM_ACCEL_* bits in mm_accel */
 241 +{
 242 +    a52_upmix= upmix_C; /* portable C versions are the default */
 243 +    a52_downmix= downmix_C;
 244 +#if ARCH_X86 || ARCH_X86_64
 245 +    if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX;
 246 +    if(mm_accel & MM_ACCEL_X86_SSE) a52_downmix= downmix_SSE;
 247 +    if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow; /* tested last, so 3DNow! replaces SSE when both flags are set */
 248 +#endif
 249 +}
 250 +
 251  int a52_downmix_init (int input, int flags, sample_t * level,
 252                       sample_t clev, sample_t slev)
 253  {
 254 @@ -447,7 +479,7 @@
 255         samples[i] = 0;
 256  }
 257
 258 -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
 259 +void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
 260                   sample_t clev, sample_t slev)
 261  {
 262      switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
 263 @@ -559,7 +591,7 @@
 264         break;
 265
 266      case CONVERT (A52_3F2R, A52_2F1R):
 267 -       mix3to2 (samples, bias);
 268 +       mix3to2 (samples, bias); //FIXME possible bug? (output doesnt seem to be used)
 269         move2to1 (samples + 768, samples + 512, bias);
 270         break;
 271
 272 @@ -583,12 +615,12 @@
 273         break;
 274
 275      case CONVERT (A52_3F1R, A52_3F2R):
 276 -       memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t));
 277 +       memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
 278         break;
 279      }
 280  }
 281
 282 -void a52_upmix (sample_t * samples, int acmod, int output)
 283 +void upmix_C (sample_t * samples, int acmod, int output)
 284  {
 285      switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
 286
 287 @@ -653,3 +685,1104 @@
 288         goto mix_31to21;
 289      }
 290  }
 291 +
 292 +#if ARCH_X86 || ARCH_X86_64
 293 +static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias) /* SSE: dest[i] = dest[i] + src[i] + bias over one 1024-byte block */
 294 +{ /* assumes sample_t is a 4-byte float (256 samples per block) and both pointers are 16-byte aligned for movaps - TODO confirm */
 295 +       __asm__ volatile(
 296 +       "movlps %2, %%xmm7              \n\t" // load bias into the low part of xmm7
 297 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t" // replicate lane 0 (bias) into all four lanes
 298 +       "mov $-1024, %%"REG_S"          \n\t" // byte index runs from -1024 up to 0; bases below point one block past the start
 299 +       ASMALIGN(4)
 300 +       "1:                             \n\t"
 301 +       "movaps (%0, %%"REG_S"), %%xmm0 \n\t" // 8 floats of src per iteration
 302 +       "movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
 303 +       "addps (%1, %%"REG_S"), %%xmm0  \n\t" // + dest
 304 +       "addps 16(%1, %%"REG_S"), %%xmm1\n\t"
 305 +       "addps %%xmm7, %%xmm0           \n\t" // + bias
 306 +       "addps %%xmm7, %%xmm1           \n\t"
 307 +       "movaps %%xmm0, (%1, %%"REG_S") \n\t" // store back into dest
 308 +       "movaps %%xmm1, 16(%1, %%"REG_S")\n\t"
 309 +       "add $32, %%"REG_S"             \n\t"
 310 +       " jnz 1b                        \n\t" // loop until the index reaches 0
 311 +       :: "r" (src+256), "r" (dest+256), "m" (bias) // %0 = end of src block, %1 = end of dest block, %2 = bias
 312 +       : "%"REG_S // NOTE(review): the asm stores to memory but declares no outputs and no "memory" clobber; xmm registers are not listed either - confirm this is safe
 313 +       );
 314 +}
 315 +
 316 +static void mix3to1_SSE (sample_t * samples, sample_t bias) /* SSE: block0 = block0 + block1 + block2 + bias, blocks being consecutive 1024-byte regions of samples */
 317 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 318 +       __asm__ volatile(
 319 +       "movlps %1, %%xmm7              \n\t"
 320 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 321 +       "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 322 +       ASMALIGN(4)
 323 +       "1:                             \n\t"
 324 +       "movaps (%0, %%"REG_S"), %%xmm0 \n\t" // block 0
 325 +       "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" // block 1
 326 +       "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" // + block 2
 327 +       "addps %%xmm7, %%xmm1           \n\t" // + bias
 328 +       "addps %%xmm1, %%xmm0           \n\t"
 329 +       "movaps %%xmm0, (%0, %%"REG_S") \n\t" // result overwrites block 0
 330 +       "add $16, %%"REG_S"             \n\t" // 4 floats per iteration
 331 +       " jnz 1b                        \n\t"
 332 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 333 +       : "%"REG_S
 334 +       );
 335 +}
 336 +
 337 +static void mix4to1_SSE (sample_t * samples, sample_t bias) /* SSE: block0 = block0 + block1 + block2 + block3 + bias (consecutive 1024-byte blocks) */
 338 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 339 +       __asm__ volatile(
 340 +       "movlps %1, %%xmm7              \n\t"
 341 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 342 +       "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 343 +       ASMALIGN(4)
 344 +       "1:                             \n\t"
 345 +       "movaps (%0, %%"REG_S"), %%xmm0 \n\t" // block 0
 346 +       "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" // block 1
 347 +       "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" // + block 2
 348 +       "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" // + block 3
 349 +       "addps %%xmm7, %%xmm0           \n\t" // + bias
 350 +       "addps %%xmm1, %%xmm0           \n\t" // combine the two partial sums
 351 +       "movaps %%xmm0, (%0, %%"REG_S") \n\t" // result overwrites block 0
 352 +       "add $16, %%"REG_S"             \n\t"
 353 +       " jnz 1b                        \n\t"
 354 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 355 +       : "%"REG_S
 356 +       );
 357 +}
 358 +
 359 +static void mix5to1_SSE (sample_t * samples, sample_t bias) /* SSE: block0 = sum of blocks 0..4 + bias (consecutive 1024-byte blocks) */
 360 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 361 +       __asm__ volatile(
 362 +       "movlps %1, %%xmm7              \n\t"
 363 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 364 +       "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 365 +       ASMALIGN(4)
 366 +       "1:                             \n\t"
 367 +       "movaps (%0, %%"REG_S"), %%xmm0 \n\t" // block 0
 368 +       "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t" // block 1
 369 +       "addps 2048(%0, %%"REG_S"), %%xmm0\n\t" // + block 2
 370 +       "addps 3072(%0, %%"REG_S"), %%xmm1\n\t" // + block 3
 371 +       "addps %%xmm7, %%xmm0           \n\t" // + bias
 372 +       "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" // + block 4
 373 +       "addps %%xmm1, %%xmm0           \n\t" // combine the two partial sums
 374 +       "movaps %%xmm0, (%0, %%"REG_S") \n\t" // result overwrites block 0
 375 +       "add $16, %%"REG_S"             \n\t"
 376 +       " jnz 1b                        \n\t"
 377 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 378 +       : "%"REG_S
 379 +       );
 380 +}
 381 +
 382 +static void mix3to2_SSE (sample_t * samples, sample_t bias) /* SSE: block0 = block0 + block1 + bias; block1 = block2 + block1 + bias (old block1 is the shared term) */
 383 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 384 +       __asm__ volatile(
 385 +       "movlps %1, %%xmm7              \n\t"
 386 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 387 +       "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 388 +       ASMALIGN(4)
 389 +       "1:                             \n\t"
 390 +       "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" // block 1 is read before it is overwritten below
 391 +       "addps %%xmm7, %%xmm0           \n\t" //common
 392 +       "movaps (%0, %%"REG_S"), %%xmm1 \n\t" // block 0
 393 +       "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" // block 2
 394 +       "addps %%xmm0, %%xmm1           \n\t"
 395 +       "addps %%xmm0, %%xmm2           \n\t"
 396 +       "movaps %%xmm1, (%0, %%"REG_S") \n\t" // first output -> block 0
 397 +       "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" // second output -> block 1
 398 +       "add $16, %%"REG_S"             \n\t"
 399 +       " jnz 1b                        \n\t"
 400 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 401 +       : "%"REG_S
 402 +       );
 403 +}
 404 +
 405 +static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias) /* SSE: adds (block following right) + bias to both the left block and the right block */
 406 +{ /* assumes sample_t is a 4-byte float and both pointers are 16-byte aligned - TODO confirm */
 407 +       __asm__ volatile(
 408 +               "movlps %2, %%xmm7              \n\t"
 409 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 410 +               "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 411 +               ASMALIGN(4)
 412 +               "1:                             \n\t"
 413 +               "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t" // the 1024-byte block immediately after right
 414 +               "addps %%xmm7, %%xmm0           \n\t" //common
 415 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t" // left
 416 +               "movaps (%1, %%"REG_S"), %%xmm2 \n\t" // right
 417 +               "addps %%xmm0, %%xmm1           \n\t"
 418 +               "addps %%xmm0, %%xmm2           \n\t"
 419 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t" // left updated in place
 420 +               "movaps %%xmm2, (%1, %%"REG_S") \n\t" // right updated in place
 421 +               "add $16, %%"REG_S"             \n\t"
 422 +               " jnz 1b                        \n\t"
 423 +       :: "r" (left+256), "r" (right+256), "m" (bias) // %0 = end of left block, %1 = end of right block, %2 = bias
 424 +       : "%"REG_S
 425 +       );
 426 +}
 427 +
 428 +static void mix21toS_SSE (sample_t * samples, sample_t bias) /* SSE: block0 = block0 + bias - block2; block1 = block1 + bias + block2 */
 429 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 430 +       __asm__ volatile(
 431 +               "movlps %1, %%xmm7              \n\t"
 432 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 433 +               "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 434 +               ASMALIGN(4)
 435 +               "1:                             \n\t"
 436 +               "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"  // surround
 437 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t" // block 0
 438 +               "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" // block 1
 439 +               "addps %%xmm7, %%xmm1           \n\t"
 440 +               "addps %%xmm7, %%xmm2           \n\t"
 441 +               "subps %%xmm0, %%xmm1           \n\t" // surround is subtracted from the first output...
 442 +               "addps %%xmm0, %%xmm2           \n\t" // ...and added to the second
 443 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 444 +               "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
 445 +               "add $16, %%"REG_S"             \n\t"
 446 +               " jnz 1b                        \n\t"
 447 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 448 +       : "%"REG_S
 449 +       );
 450 +}
 451 +
 452 +static void mix31to2_SSE (sample_t * samples, sample_t bias) /* SSE: common = block1 + block3 + bias; block0 = block0 + common; block1 = block2 + common */
 453 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 454 +       __asm__ volatile(
 455 +               "movlps %1, %%xmm7              \n\t"
 456 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 457 +               "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 458 +               ASMALIGN(4)
 459 +               "1:                             \n\t"
 460 +               "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" // block 1
 461 +               "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // + block 3
 462 +               "addps %%xmm7, %%xmm0           \n\t" // common
 463 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t" // block 0
 464 +               "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" // block 2
 465 +               "addps %%xmm0, %%xmm1           \n\t"
 466 +               "addps %%xmm0, %%xmm2           \n\t"
 467 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t" // first output -> block 0
 468 +               "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t" // second output -> block 1
 469 +               "add $16, %%"REG_S"             \n\t"
 470 +               " jnz 1b                        \n\t"
 471 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 472 +       : "%"REG_S
 473 +       );
 474 +}
 475 +
 476 +static void mix31toS_SSE (sample_t * samples, sample_t bias) /* SSE: common = block1 + bias; block0 = block0 + common - block3; block1 = block2 + common + block3 */
 477 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 478 +       __asm__ volatile(
 479 +               "movlps %1, %%xmm7              \n\t"
 480 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 481 +               "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 482 +               ASMALIGN(4)
 483 +               "1:                             \n\t"
 484 +               "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" // block 1
 485 +               "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround
 486 +               "addps %%xmm7, %%xmm0           \n\t" // common
 487 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t" // block 0
 488 +               "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t" // block 2
 489 +               "addps %%xmm0, %%xmm1           \n\t"
 490 +               "addps %%xmm0, %%xmm2           \n\t"
 491 +               "subps %%xmm3, %%xmm1           \n\t" // surround subtracted from the first output...
 492 +               "addps %%xmm3, %%xmm2           \n\t" // ...and added to the second
 493 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 494 +               "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
 495 +               "add $16, %%"REG_S"             \n\t"
 496 +               " jnz 1b                        \n\t"
 497 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 498 +       : "%"REG_S
 499 +       );
 500 +}
 501 +
 502 +static void mix22toS_SSE (sample_t * samples, sample_t bias) /* SSE: surround = block2 + block3; block0 = block0 + bias - surround; block1 = block1 + bias + surround */
 503 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 504 +       __asm__ volatile(
 505 +               "movlps %1, %%xmm7              \n\t"
 506 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 507 +               "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 508 +               ASMALIGN(4)
 509 +               "1:                             \n\t"
 510 +               "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t" // block 2
 511 +               "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround
 512 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t" // block 0
 513 +               "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t" // block 1
 514 +               "addps %%xmm7, %%xmm1           \n\t"
 515 +               "addps %%xmm7, %%xmm2           \n\t"
 516 +               "subps %%xmm0, %%xmm1           \n\t" // surround subtracted from the first output...
 517 +               "addps %%xmm0, %%xmm2           \n\t" // ...and added to the second
 518 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 519 +               "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
 520 +               "add $16, %%"REG_S"             \n\t"
 521 +               " jnz 1b                        \n\t"
 522 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 523 +       : "%"REG_S
 524 +       );
 525 +}
 526 +
 527 +static void mix32to2_SSE (sample_t * samples, sample_t bias) /* SSE: common = block1 + bias; block0 = common + block0 + block3; block1 = common + block2 + block4 */
 528 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 529 +       __asm__ volatile(
 530 +       "movlps %1, %%xmm7              \n\t"
 531 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 532 +       "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 533 +       ASMALIGN(4)
 534 +       "1:                             \n\t"
 535 +       "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" // block 1
 536 +       "addps %%xmm7, %%xmm0           \n\t" // common
 537 +       "movaps %%xmm0, %%xmm1          \n\t" // common
 538 +       "addps (%0, %%"REG_S"), %%xmm0  \n\t" // + block 0
 539 +       "addps 2048(%0, %%"REG_S"), %%xmm1\n\t" // + block 2
 540 +       "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // + block 3
 541 +       "addps 4096(%0, %%"REG_S"), %%xmm1\n\t" // + block 4
 542 +       "movaps %%xmm0, (%0, %%"REG_S") \n\t" // first output -> block 0
 543 +       "movaps %%xmm1, 1024(%0, %%"REG_S")\n\t" // second output -> block 1
 544 +       "add $16, %%"REG_S"             \n\t"
 545 +       " jnz 1b                        \n\t"
 546 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 547 +       : "%"REG_S
 548 +       );
 549 +}
 550 +
 551 +static void mix32toS_SSE (sample_t * samples, sample_t bias) /* SSE: common = block1 + bias; surround = block3 + block4; block0 = block0 - surround + common; block1 = block2 + surround + common */
 552 +{ /* assumes sample_t is a 4-byte float and samples is 16-byte aligned - TODO confirm */
 553 +       __asm__ volatile(
 554 +       "movlps %1, %%xmm7              \n\t"
 555 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 556 +       "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 557 +       ASMALIGN(4)
 558 +       "1:                             \n\t"
 559 +       "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t" // block 1
 560 +       "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t" // block 3
 561 +       "addps %%xmm7, %%xmm0           \n\t" // common
 562 +       "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround
 563 +       "movaps (%0, %%"REG_S"), %%xmm1 \n\t" // block 0
 564 +       "movaps 2048(%0, %%"REG_S"), %%xmm3\n\t" // block 2
 565 +       "subps %%xmm2, %%xmm1           \n\t" // surround subtracted from the first output...
 566 +       "addps %%xmm2, %%xmm3           \n\t" // ...and added to the second
 567 +       "addps %%xmm0, %%xmm1           \n\t"
 568 +       "addps %%xmm0, %%xmm3           \n\t"
 569 +       "movaps %%xmm1, (%0, %%"REG_S") \n\t" // first output -> block 0
 570 +       "movaps %%xmm3, 1024(%0, %%"REG_S")\n\t" // second output -> block 1
 571 +       "add $16, %%"REG_S"             \n\t"
 572 +       " jnz 1b                        \n\t"
 573 +       :: "r" (samples+256), "m" (bias) // %0 = end of block 0, %1 = bias
 574 +       : "%"REG_S
 575 +       );
 576 +}
 577 +
 578 +static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias) /* SSE: dest[i] = src[i] + (block following src)[i] + bias; note src comes first in the parameter list */
 579 +{ /* assumes sample_t is a 4-byte float and both pointers are 16-byte aligned - TODO confirm */
 580 +       __asm__ volatile(
 581 +               "movlps %2, %%xmm7              \n\t"
 582 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t" // xmm7 = bias in all four lanes
 583 +               "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 584 +               ASMALIGN(4)
 585 +               "1:                             \n\t"
 586 +               "movaps (%0, %%"REG_S"), %%xmm0 \n\t" // 8 floats of the first source block per iteration
 587 +               "movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
 588 +               "addps 1024(%0, %%"REG_S"), %%xmm0\n\t" // + second source block (1024 bytes further on)
 589 +               "addps 1040(%0, %%"REG_S"), %%xmm1\n\t"
 590 +               "addps %%xmm7, %%xmm0           \n\t" // + bias
 591 +               "addps %%xmm7, %%xmm1           \n\t"
 592 +               "movaps %%xmm0, (%1, %%"REG_S") \n\t" // dest is overwritten, not accumulated into
 593 +               "movaps %%xmm1, 16(%1, %%"REG_S")\n\t"
 594 +               "add $32, %%"REG_S"             \n\t"
 595 +               " jnz 1b                        \n\t"
 596 +       :: "r" (src+256), "r" (dest+256), "m" (bias) // %0 = end of first src block, %1 = end of dest block, %2 = bias
 597 +       : "%"REG_S // only the index register is declared as clobbered
 598 +       );
 599 +}
 600 +
 601 +static void zero_MMX(sample_t * samples) /* MMX: fill the 1024 bytes starting at samples with zero bits */
 602 +{ /* 1024 bytes is 256 samples if sample_t is 4 bytes wide - TODO confirm */
 603 +       __asm__ volatile(
 604 +               "mov $-1024, %%"REG_S"          \n\t" // byte index, counts up to 0
 605 +               "pxor %%mm0, %%mm0              \n\t" // mm0 = 0
 606 +               ASMALIGN(4)
 607 +               "1:                             \n\t"
 608 +               "movq %%mm0, (%0, %%"REG_S")    \n\t" // four 8-byte stores = 32 bytes per iteration
 609 +               "movq %%mm0, 8(%0, %%"REG_S")   \n\t"
 610 +               "movq %%mm0, 16(%0, %%"REG_S")  \n\t"
 611 +               "movq %%mm0, 24(%0, %%"REG_S")  \n\t"
 612 +               "add $32, %%"REG_S"             \n\t"
 613 +               " jnz 1b                        \n\t"
 614 +               "emms" // leave MMX state so later x87 floating-point code works
 615 +       :: "r" (samples+256) // %0 = end of the block being cleared
 616 +       : "%"REG_S // only the index register is declared as clobbered
 617 +       );
 618 +}
 619 +
 620 +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias, /* SSE dispatcher: downmix samples in place from layout acmod to layout output */
 621 +             sample_t clev, sample_t slev) /* clev is not referenced in this function; slev is only compared against 0 */
 622 +{
 623 +    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { /* combinations not listed below leave samples untouched */
 624 +
 625 +    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
 626 +       memcpy (samples, samples + 256, 256 * sizeof (sample_t)); /* second block replaces the first */
 627 +       break;
 628 +
 629 +    case CONVERT (A52_CHANNEL, A52_MONO):
 630 +    case CONVERT (A52_STEREO, A52_MONO):
 631 +    mix_2to1_SSE:
 632 +       mix2to1_SSE (samples, samples + 256, bias);
 633 +       break;
 634 +
 635 +    case CONVERT (A52_2F1R, A52_MONO):
 636 +       if (slev == 0)
 637 +           goto mix_2to1_SSE; /* zero surround level: third block is left out; otherwise falls through */
 638 +    case CONVERT (A52_3F, A52_MONO):
 639 +    mix_3to1_SSE:
 640 +       mix3to1_SSE (samples, bias);
 641 +       break;
 642 +
 643 +    case CONVERT (A52_3F1R, A52_MONO):
 644 +       if (slev == 0)
 645 +           goto mix_3to1_SSE; /* otherwise falls through; the slev test in the next case is then false as well */
 646 +    case CONVERT (A52_2F2R, A52_MONO):
 647 +       if (slev == 0)
 648 +           goto mix_2to1_SSE;
 649 +       mix4to1_SSE (samples, bias);
 650 +       break;
 651 +
 652 +    case CONVERT (A52_3F2R, A52_MONO):
 653 +       if (slev == 0)
 654 +           goto mix_3to1_SSE;
 655 +       mix5to1_SSE (samples, bias);
 656 +       break;
 657 +
 658 +    case CONVERT (A52_MONO, A52_DOLBY):
 659 +       memcpy (samples + 256, samples, 256 * sizeof (sample_t)); /* duplicate the first block into the second */
 660 +       break;
 661 +
 662 +    case CONVERT (A52_3F, A52_STEREO):
 663 +    case CONVERT (A52_3F, A52_DOLBY):
 664 +    mix_3to2_SSE:
 665 +       mix3to2_SSE (samples, bias);
 666 +       break;
 667 +
 668 +    case CONVERT (A52_2F1R, A52_STEREO):
 669 +       if (slev == 0)
 670 +           break; /* nothing is mixed and bias is not added on this path */
 671 +       mix21to2_SSE (samples, samples + 256, bias);
 672 +       break;
 673 +
 674 +    case CONVERT (A52_2F1R, A52_DOLBY):
 675 +       mix21toS_SSE (samples, bias);
 676 +       break;
 677 +
 678 +    case CONVERT (A52_3F1R, A52_STEREO):
 679 +       if (slev == 0)
 680 +           goto mix_3to2_SSE;
 681 +       mix31to2_SSE (samples, bias);
 682 +       break;
 683 +
 684 +    case CONVERT (A52_3F1R, A52_DOLBY):
 685 +       mix31toS_SSE (samples, bias);
 686 +       break;
 687 +
 688 +    case CONVERT (A52_2F2R, A52_STEREO):
 689 +       if (slev == 0)
 690 +           break;
 691 +       mix2to1_SSE (samples, samples + 512, bias); /* block 0 += block 2 + bias */
 692 +       mix2to1_SSE (samples + 256, samples + 768, bias); /* block 1 += block 3 + bias */
 693 +       break;
 694 +
 695 +    case CONVERT (A52_2F2R, A52_DOLBY):
 696 +       mix22toS_SSE (samples, bias);
 697 +       break;
 698 +
 699 +    case CONVERT (A52_3F2R, A52_STEREO):
 700 +       if (slev == 0)
 701 +           goto mix_3to2_SSE;
 702 +       mix32to2_SSE (samples, bias);
 703 +       break;
 704 +
 705 +    case CONVERT (A52_3F2R, A52_DOLBY):
 706 +       mix32toS_SSE (samples, bias);
 707 +       break;
 708 +
 709 +    case CONVERT (A52_3F1R, A52_3F):
 710 +       if (slev == 0)
 711 +           break;
 712 +       mix21to2_SSE (samples, samples + 512, bias); /* block 3 (+ bias) is added to blocks 0 and 2 */
 713 +       break;
 714 +
 715 +    case CONVERT (A52_3F2R, A52_3F):
 716 +       if (slev == 0)
 717 +           break;
 718 +       mix2to1_SSE (samples, samples + 768, bias); /* block 0 += block 3 + bias */
 719 +       mix2to1_SSE (samples + 512, samples + 1024, bias); /* block 2 += block 4 + bias */
 720 +       break;
 721 +
 722 +    case CONVERT (A52_3F1R, A52_2F1R):
 723 +       mix3to2_SSE (samples, bias);
 724 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); /* move block 3 down into block 2 */
 725 +       break;
 726 +
 727 +    case CONVERT (A52_2F2R, A52_2F1R):
 728 +       mix2to1_SSE (samples + 512, samples + 768, bias); /* block 2 += block 3 + bias */
 729 +       break;
 730 +
 731 +    case CONVERT (A52_3F2R, A52_2F1R):
 732 +       mix3to2_SSE (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
 733 +       move2to1_SSE (samples + 768, samples + 512, bias); /* block 2 = block 3 + block 4 + bias */
 734 +       break;
 735 +
 736 +    case CONVERT (A52_3F2R, A52_3F1R):
 737 +       mix2to1_SSE (samples + 768, samples + 1024, bias); /* block 3 += block 4 + bias */
 738 +       break;
 739 +
 740 +    case CONVERT (A52_2F1R, A52_2F2R):
 741 +       memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); /* duplicate block 2 into block 3 */
 742 +       break;
 743 +
 744 +    case CONVERT (A52_3F1R, A52_2F2R):
 745 +       mix3to2_SSE (samples, bias);
 746 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); /* move block 3 down into block 2 */
 747 +       break;
 748 +
 749 +    case CONVERT (A52_3F2R, A52_2F2R):
 750 +       mix3to2_SSE (samples, bias);
 751 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); /* shift blocks 3 and 4 down by one block */
 752 +       memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
 753 +       break;
 754 +
 755 +    case CONVERT (A52_3F1R, A52_3F2R):
 756 +       memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); /* duplicate block 3 into block 4 */
 757 +       break;
 758 +    }
 759 +}
 760 +
 761 +static void upmix_MMX (sample_t * samples, int acmod, int output) /* MMX dispatcher: rearrange 256-sample blocks in place and clear unused ones with zero_MMX */
 762 +{
 763 +    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { /* combinations not listed below leave samples untouched */
 764 +
 765 +    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
 766 +       memcpy (samples + 256, samples, 256 * sizeof (sample_t)); /* duplicate block 0 into block 1 */
 767 +       break;
 768 +
 769 +    case CONVERT (A52_3F2R, A52_MONO):
 770 +       zero_MMX (samples + 1024); /* falls through: each later case clears one more block */
 771 +    case CONVERT (A52_3F1R, A52_MONO):
 772 +    case CONVERT (A52_2F2R, A52_MONO):
 773 +       zero_MMX (samples + 768); /* falls through */
 774 +    case CONVERT (A52_3F, A52_MONO):
 775 +    case CONVERT (A52_2F1R, A52_MONO):
 776 +       zero_MMX (samples + 512); /* falls through */
 777 +    case CONVERT (A52_CHANNEL, A52_MONO):
 778 +    case CONVERT (A52_STEREO, A52_MONO):
 779 +       zero_MMX (samples + 256);
 780 +       break;
 781 +
 782 +    case CONVERT (A52_3F2R, A52_STEREO):
 783 +    case CONVERT (A52_3F2R, A52_DOLBY):
 784 +       zero_MMX (samples + 1024); /* falls through */
 785 +    case CONVERT (A52_3F1R, A52_STEREO):
 786 +    case CONVERT (A52_3F1R, A52_DOLBY):
 787 +       zero_MMX (samples + 768); /* falls through */
 788 +    case CONVERT (A52_3F, A52_STEREO):
 789 +    case CONVERT (A52_3F, A52_DOLBY):
 790 +    mix_3to2_MMX:
 791 +       memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); /* move block 1 up into block 2... */
 792 +       zero_MMX (samples + 256); /* ...and clear block 1 */
 793 +       break;
 794 +
 795 +    case CONVERT (A52_2F2R, A52_STEREO):
 796 +    case CONVERT (A52_2F2R, A52_DOLBY):
 797 +       zero_MMX (samples + 768); /* falls through */
 798 +    case CONVERT (A52_2F1R, A52_STEREO):
 799 +    case CONVERT (A52_2F1R, A52_DOLBY):
 800 +       zero_MMX (samples + 512);
 801 +       break;
 802 +
 803 +    case CONVERT (A52_3F2R, A52_3F):
 804 +       zero_MMX (samples + 1024); /* falls through */
 805 +    case CONVERT (A52_3F1R, A52_3F):
 806 +    case CONVERT (A52_2F2R, A52_2F1R):
 807 +       zero_MMX (samples + 768);
 808 +       break;
 809 +
 810 +    case CONVERT (A52_3F2R, A52_3F1R):
 811 +       zero_MMX (samples + 1024);
 812 +       break;
 813 +
 814 +    case CONVERT (A52_3F2R, A52_2F1R):
 815 +       zero_MMX (samples + 1024); /* falls through */
 816 +    case CONVERT (A52_3F1R, A52_2F1R):
 817 +    mix_31to21_MMX:
 818 +       memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); /* move block 2 up into block 3 */
 819 +       goto mix_3to2_MMX; /* then move block 1 up and clear it */
 820 +
 821 +    case CONVERT (A52_3F2R, A52_2F2R):
 822 +       memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); /* move block 3 up into block 4 */
 823 +       goto mix_31to21_MMX; /* then shift blocks 2 and 1 up the same way */
 824 +    }
 825 +}
 826 +
 827 +static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias) /* 3DNow!: dest[i] += src[i] + bias over 1024 bytes (256 samples) */
 828 +{
 829 +       __asm__ volatile(
 830 +       "movd  %2, %%mm7        \n\t" // mm7 low dword = bias
 831 +       "punpckldq %2, %%mm7    \n\t" // mm7 = {bias, bias}
 832 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
 833 +       ASMALIGN(4)
 834 +       "1:                     \n\t"
 835 +       "movq  (%0, %%"REG_S"), %%mm0   \n\t" // load 8 samples from src
 836 +       "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
 837 +       "movq  16(%0, %%"REG_S"), %%mm2 \n\t"
 838 +       "movq  24(%0, %%"REG_S"), %%mm3 \n\t"
 839 +       "pfadd (%1, %%"REG_S"), %%mm0   \n\t" // + dest
 840 +       "pfadd 8(%1, %%"REG_S"), %%mm1  \n\t"
 841 +       "pfadd 16(%1, %%"REG_S"), %%mm2 \n\t"
 842 +       "pfadd 24(%1, %%"REG_S"), %%mm3 \n\t"
 843 +       "pfadd %%mm7, %%mm0             \n\t" // + bias
 844 +       "pfadd %%mm7, %%mm1             \n\t"
 845 +       "pfadd %%mm7, %%mm2             \n\t"
 846 +       "pfadd %%mm7, %%mm3             \n\t"
 847 +       "movq  %%mm0, (%1, %%"REG_S")   \n\t" // store the sums back to dest
 848 +       "movq  %%mm1, 8(%1, %%"REG_S")  \n\t"
 849 +       "movq  %%mm2, 16(%1, %%"REG_S") \n\t"
 850 +       "movq  %%mm3, 24(%1, %%"REG_S") \n\t"
 851 +       "add $32, %%"REG_S"             \n\t" // 32 bytes handled per iteration
 852 +       " jnz 1b                        \n\t" // loop until the offset reaches 0
 853 +       :: "r" (src+256), "r" (dest+256), "m" (bias) // pointers are pre-advanced so the negative offset can end at zero
 854 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffers, so the compiler must not reorder or cache accesses around it
 855 +       );
 856 +}
 857 +
 858 +static void mix3to1_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: samples[i] += samples[i+256] + samples[i+512] + bias, 256 samples */
 859 +{
 860 +       __asm__ volatile(
 861 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
 862 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
 863 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
 864 +       ASMALIGN(4)
 865 +       "1:                     \n\t"
 866 +       "movq  (%0, %%"REG_S"), %%mm0   \n\t" // first 1024-byte channel block
 867 +       "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
 868 +       "movq  1024(%0, %%"REG_S"), %%mm2\n\t" // second channel block (+1024 bytes)
 869 +       "movq  1032(%0, %%"REG_S"), %%mm3\n\t"
 870 +       "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" // + third channel block (+2048 bytes)
 871 +       "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
 872 +       "pfadd %%mm7, %%mm0             \n\t" // + bias
 873 +       "pfadd %%mm7, %%mm1             \n\t"
 874 +       "pfadd %%mm2, %%mm0             \n\t" // + second block
 875 +       "pfadd %%mm3, %%mm1             \n\t"
 876 +       "movq  %%mm0, (%0, %%"REG_S")   \n\t" // result overwrites the first block
 877 +       "movq  %%mm1, 8(%0, %%"REG_S")  \n\t"
 878 +       "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
 879 +       " jnz 1b                        \n\t" // loop until the offset reaches 0
 880 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
 881 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
 882 +       );
 883 +}
 884 +
 885 +static void mix4to1_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: samples[i] += samples[i+256] + samples[i+512] + samples[i+768] + bias, 256 samples */
 886 +{
 887 +       __asm__ volatile(
 888 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
 889 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
 890 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
 891 +       ASMALIGN(4)
 892 +       "1:                     \n\t"
 893 +       "movq  (%0, %%"REG_S"), %%mm0   \n\t" // first 1024-byte channel block
 894 +       "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
 895 +       "movq  1024(%0, %%"REG_S"), %%mm2\n\t" // second channel block
 896 +       "movq  1032(%0, %%"REG_S"), %%mm3\n\t"
 897 +       "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" // first + third
 898 +       "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
 899 +       "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" // second + fourth
 900 +       "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t"
 901 +       "pfadd %%mm7, %%mm0             \n\t" // + bias
 902 +       "pfadd %%mm7, %%mm1             \n\t"
 903 +       "pfadd %%mm2, %%mm0             \n\t" // combine both partial sums
 904 +       "pfadd %%mm3, %%mm1             \n\t"
 905 +       "movq  %%mm0, (%0, %%"REG_S")   \n\t" // result overwrites the first block
 906 +       "movq  %%mm1, 8(%0, %%"REG_S")  \n\t"
 907 +       "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
 908 +       " jnz 1b                        \n\t" // loop until the offset reaches 0
 909 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
 910 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
 911 +       );
 912 +}
 913 +
 914 +static void mix5to1_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: samples[i] += sum of the next four 256-sample blocks + bias */
 915 +{
 916 +       __asm__ volatile(
 917 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
 918 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
 919 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
 920 +       ASMALIGN(4)
 921 +       "1:                     \n\t"
 922 +       "movq  (%0, %%"REG_S"), %%mm0   \n\t" // first 1024-byte channel block
 923 +       "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
 924 +       "movq  1024(%0, %%"REG_S"), %%mm2\n\t" // second channel block
 925 +       "movq  1032(%0, %%"REG_S"), %%mm3\n\t"
 926 +       "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t" // first + third
 927 +       "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
 928 +       "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t" // second + fourth
 929 +       "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t"
 930 +       "pfadd %%mm7, %%mm0             \n\t" // + bias
 931 +       "pfadd %%mm7, %%mm1             \n\t"
 932 +       "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t" // + fifth
 933 +       "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t"
 934 +       "pfadd %%mm2, %%mm0             \n\t" // combine both partial sums
 935 +       "pfadd %%mm3, %%mm1             \n\t"
 936 +       "movq  %%mm0, (%0, %%"REG_S")   \n\t" // result overwrites the first block
 937 +       "movq  %%mm1, 8(%0, %%"REG_S")  \n\t"
 938 +       "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
 939 +       " jnz 1b                        \n\t" // loop until the offset reaches 0
 940 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
 941 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
 942 +       );
 943 +}
 944 +
 945 +static void mix3to2_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: common = samples[i+256] + bias; samples[i] += common; samples[i+256] = samples[i+512] + common */
 946 +{
 947 +       __asm__ volatile(
 948 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
 949 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
 950 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
 951 +       ASMALIGN(4)
 952 +       "1:                     \n\t"
 953 +       "movq   1024(%0, %%"REG_S"), %%mm0\n\t" // second 1024-byte channel block
 954 +       "movq   1032(%0, %%"REG_S"), %%mm1\n\t"
 955 +       "pfadd  %%mm7, %%mm0            \n\t" //common
 956 +       "pfadd  %%mm7, %%mm1            \n\t" //common
 957 +       "movq   (%0, %%"REG_S"), %%mm2  \n\t" // first channel block
 958 +       "movq   8(%0, %%"REG_S"), %%mm3 \n\t"
 959 +       "movq   2048(%0, %%"REG_S"), %%mm4\n\t" // third channel block
 960 +       "movq   2056(%0, %%"REG_S"), %%mm5\n\t"
 961 +       "pfadd  %%mm0, %%mm2            \n\t" // first + common
 962 +       "pfadd  %%mm1, %%mm3            \n\t"
 963 +       "pfadd  %%mm0, %%mm4            \n\t" // third + common
 964 +       "pfadd  %%mm1, %%mm5            \n\t"
 965 +       "movq   %%mm2, (%0, %%"REG_S")  \n\t" // first output goes to block 0
 966 +       "movq   %%mm3, 8(%0, %%"REG_S") \n\t"
 967 +       "movq   %%mm4, 1024(%0, %%"REG_S")\n\t" // second output overwrites block 1
 968 +       "movq   %%mm5, 1032(%0, %%"REG_S")\n\t"
 969 +       "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
 970 +       " jnz 1b                        \n\t" // loop until the offset reaches 0
 971 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
 972 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
 973 +       );
 974 +}
 975 +
 976 +static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias) /* 3DNow!: common = right[i+256] + bias; left[i] += common; right[i] += common */
 977 +{
 978 +       __asm__ volatile(
 979 +               "movd  %2, %%mm7        \n\t" // mm7 low dword = bias
 980 +               "punpckldq %2, %%mm7    \n\t" // mm7 = {bias, bias}
 981 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
 982 +               ASMALIGN(4)
 983 +               "1:                     \n\t"
 984 +               "movq  1024(%1, %%"REG_S"), %%mm0\n\t" // block 1024 bytes after right
 985 +               "movq  1032(%1, %%"REG_S"), %%mm1\n\t"
 986 +               "pfadd %%mm7, %%mm0             \n\t" //common
 987 +               "pfadd %%mm7, %%mm1             \n\t" //common
 988 +               "movq  (%0, %%"REG_S"), %%mm2   \n\t" // left
 989 +               "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
 990 +               "movq  (%1, %%"REG_S"), %%mm4   \n\t" // right
 991 +               "movq  8(%1, %%"REG_S"), %%mm5  \n\t"
 992 +               "pfadd %%mm0, %%mm2             \n\t" // left + common
 993 +               "pfadd %%mm1, %%mm3             \n\t"
 994 +               "pfadd %%mm0, %%mm4             \n\t" // right + common
 995 +               "pfadd %%mm1, %%mm5             \n\t"
 996 +               "movq  %%mm2, (%0, %%"REG_S")   \n\t" // store left
 997 +               "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
 998 +               "movq  %%mm4, (%1, %%"REG_S")   \n\t" // store right
 999 +               "movq  %%mm5, 8(%1, %%"REG_S")  \n\t"
1000 +               "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
1001 +               " jnz 1b                        \n\t" // loop until the offset reaches 0
1002 +       :: "r" (left+256), "r" (right+256), "m" (bias) // pointers are pre-advanced so the negative offset can end at zero
1003 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffers, so the compiler must not reorder or cache accesses around it
1004 +       );
1005 +}
1006 +
1007 +static void mix21toS_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: surround = samples[i+512]; samples[i] += bias - surround; samples[i+256] += bias + surround */
1008 +{
1009 +       __asm__ volatile(
1010 +               "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1011 +               "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1012 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
1013 +               ASMALIGN(4)
1014 +               "1:                     \n\t"
1015 +               "movq  2048(%0, %%"REG_S"), %%mm0\n\t"  // surround
1016 +               "movq  2056(%0, %%"REG_S"), %%mm1\n\t"  // surround
1017 +               "movq  (%0, %%"REG_S"), %%mm2   \n\t" // first 1024-byte channel block
1018 +               "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1019 +               "movq  1024(%0, %%"REG_S"), %%mm4\n\t" // second channel block
1020 +               "movq  1032(%0, %%"REG_S"), %%mm5\n\t"
1021 +               "pfadd %%mm7, %%mm2             \n\t" // + bias
1022 +               "pfadd %%mm7, %%mm3             \n\t"
1023 +               "pfadd %%mm7, %%mm4             \n\t"
1024 +               "pfadd %%mm7, %%mm5             \n\t"
1025 +               "pfsub %%mm0, %%mm2             \n\t" // first output: surround subtracted
1026 +               "pfsub %%mm1, %%mm3             \n\t"
1027 +               "pfadd %%mm0, %%mm4             \n\t" // second output: surround added
1028 +               "pfadd %%mm1, %%mm5             \n\t"
1029 +               "movq  %%mm2, (%0, %%"REG_S")   \n\t"
1030 +               "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1031 +               "movq  %%mm4, 1024(%0, %%"REG_S")\n\t"
1032 +               "movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
1033 +               "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
1034 +               " jnz 1b                        \n\t" // loop until the offset reaches 0
1035 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
1036 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
1037 +       );
1038 +}
1039 +
1040 +static void mix31to2_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: common = samples[i+256] + samples[i+768] + bias; samples[i] += common; samples[i+256] = samples[i+512] + common */
1041 +{
1042 +       __asm__ volatile(
1043 +               "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1044 +               "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1045 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
1046 +               ASMALIGN(4)
1047 +               "1:                     \n\t"
1048 +               "movq  1024(%0, %%"REG_S"), %%mm0\n\t" // second 1024-byte channel block
1049 +               "movq  1032(%0, %%"REG_S"), %%mm1\n\t"
1050 +               "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // + fourth channel block
1051 +               "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t"
1052 +               "pfadd %%mm7, %%mm0             \n\t" // common
1053 +               "pfadd %%mm7, %%mm1             \n\t" // common
1054 +               "movq  (%0, %%"REG_S"), %%mm2   \n\t" // first channel block
1055 +               "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1056 +               "movq  2048(%0, %%"REG_S"), %%mm4\n\t" // third channel block
1057 +               "movq  2056(%0, %%"REG_S"), %%mm5\n\t"
1058 +               "pfadd %%mm0, %%mm2             \n\t" // first + common
1059 +               "pfadd %%mm1, %%mm3             \n\t"
1060 +               "pfadd %%mm0, %%mm4             \n\t" // third + common
1061 +               "pfadd %%mm1, %%mm5             \n\t"
1062 +               "movq  %%mm2, (%0, %%"REG_S")   \n\t" // first output goes to block 0
1063 +               "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1064 +               "movq  %%mm4, 1024(%0, %%"REG_S")\n\t" // second output overwrites block 1
1065 +               "movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
1066 +               "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
1067 +               " jnz 1b                        \n\t" // loop until the offset reaches 0
1068 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
1069 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
1070 +       );
1071 +}
1072 +
1073 +static void mix31toS_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: common = samples[i+256] + bias; surround = samples[i+768]; samples[i] += common - surround; samples[i+256] = samples[i+512] + common + surround */
1074 +{
1075 +       __asm__ volatile(
1076 +               "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1077 +               "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1078 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
1079 +               ASMALIGN(4)
1080 +               "1:                     \n\t"
1081 +               "movq   1024(%0, %%"REG_S"), %%mm0\n\t" // second 1024-byte channel block
1082 +               "movq   1032(%0, %%"REG_S"), %%mm1\n\t"
1083 +               "pfadd  %%mm7, %%mm0            \n\t" // common
1084 +               "pfadd  %%mm7, %%mm1            \n\t" // common
1085 +               "movq   (%0, %%"REG_S"), %%mm2  \n\t" // first channel block
1086 +               "movq   8(%0, %%"REG_S"), %%mm3 \n\t"
1087 +               "movq   2048(%0, %%"REG_S"), %%mm4\n\t" // third channel block
1088 +               "movq   2056(%0, %%"REG_S"), %%mm5\n\t"
1089 +               "pfadd  %%mm0, %%mm2            \n\t" // first + common
1090 +               "pfadd  %%mm1, %%mm3            \n\t"
1091 +               "pfadd  %%mm0, %%mm4            \n\t" // third + common
1092 +               "pfadd  %%mm1, %%mm5            \n\t"
1093 +               "movq   3072(%0, %%"REG_S"), %%mm0\n\t" // surround
1094 +               "movq   3080(%0, %%"REG_S"), %%mm1\n\t" // surround
1095 +               "pfsub  %%mm0, %%mm2            \n\t" // first output: surround subtracted
1096 +               "pfsub  %%mm1, %%mm3            \n\t"
1097 +               "pfadd  %%mm0, %%mm4            \n\t" // second output: surround added
1098 +               "pfadd  %%mm1, %%mm5            \n\t"
1099 +               "movq   %%mm2, (%0, %%"REG_S")  \n\t"
1100 +               "movq   %%mm3, 8(%0, %%"REG_S") \n\t"
1101 +               "movq   %%mm4, 1024(%0, %%"REG_S")\n\t"
1102 +               "movq   %%mm5, 1032(%0, %%"REG_S")\n\t"
1103 +               "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
1104 +               " jnz 1b                        \n\t" // loop until the offset reaches 0
1105 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
1106 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
1107 +       );
1108 +}
1109 +
1110 +static void mix22toS_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: surround = samples[i+512] + samples[i+768]; samples[i] += bias - surround; samples[i+256] += bias + surround */
1111 +{
1112 +       __asm__ volatile(
1113 +               "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1114 +               "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1115 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
1116 +               ASMALIGN(4)
1117 +               "1:                     \n\t"
1118 +               "movq  2048(%0, %%"REG_S"), %%mm0\n\t" // third 1024-byte channel block
1119 +               "movq  2056(%0, %%"REG_S"), %%mm1\n\t"
1120 +               "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround
1121 +               "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround
1122 +               "movq  (%0, %%"REG_S"), %%mm2   \n\t" // first channel block
1123 +               "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1124 +               "movq  1024(%0, %%"REG_S"), %%mm4\n\t" // second channel block
1125 +               "movq  1032(%0, %%"REG_S"), %%mm5\n\t"
1126 +               "pfadd %%mm7, %%mm2             \n\t" // + bias
1127 +               "pfadd %%mm7, %%mm3             \n\t"
1128 +               "pfadd %%mm7, %%mm4             \n\t"
1129 +               "pfadd %%mm7, %%mm5             \n\t"
1130 +               "pfsub %%mm0, %%mm2             \n\t" // first output: surround subtracted
1131 +               "pfsub %%mm1, %%mm3             \n\t"
1132 +               "pfadd %%mm0, %%mm4             \n\t" // second output: surround added
1133 +               "pfadd %%mm1, %%mm5             \n\t"
1134 +               "movq  %%mm2, (%0, %%"REG_S")   \n\t"
1135 +               "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1136 +               "movq  %%mm4, 1024(%0, %%"REG_S")\n\t"
1137 +               "movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
1138 +               "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
1139 +               " jnz 1b                        \n\t" // loop until the offset reaches 0
1140 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
1141 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
1142 +       );
1143 +}
1144 +
1145 +static void mix32to2_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: common = samples[i+256] + bias; samples[i] += common + samples[i+768]; samples[i+256] = common + samples[i+512] + samples[i+1024] */
1146 +{
1147 +       __asm__ volatile(
1148 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1149 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1150 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
1151 +       ASMALIGN(4)
1152 +       "1:                     \n\t"
1153 +       "movq   1024(%0, %%"REG_S"), %%mm0\n\t" // second 1024-byte channel block
1154 +       "movq   1032(%0, %%"REG_S"), %%mm1\n\t"
1155 +       "pfadd  %%mm7, %%mm0            \n\t" // common
1156 +       "pfadd  %%mm7, %%mm1            \n\t" // common
1157 +       "movq   %%mm0, %%mm2            \n\t" // common
1158 +       "movq   %%mm1, %%mm3            \n\t" // common
1159 +       "pfadd  (%0, %%"REG_S"), %%mm0  \n\t" // + first channel block
1160 +       "pfadd  8(%0, %%"REG_S"), %%mm1 \n\t"
1161 +       "pfadd  2048(%0, %%"REG_S"), %%mm2\n\t" // + third channel block
1162 +       "pfadd  2056(%0, %%"REG_S"), %%mm3\n\t"
1163 +       "pfadd  3072(%0, %%"REG_S"), %%mm0\n\t" // + fourth channel block
1164 +       "pfadd  3080(%0, %%"REG_S"), %%mm1\n\t"
1165 +       "pfadd  4096(%0, %%"REG_S"), %%mm2\n\t" // + fifth channel block
1166 +       "pfadd  4104(%0, %%"REG_S"), %%mm3\n\t"
1167 +       "movq   %%mm0, (%0, %%"REG_S")  \n\t" // first output goes to block 0
1168 +       "movq   %%mm1, 8(%0, %%"REG_S") \n\t"
1169 +       "movq   %%mm2, 1024(%0, %%"REG_S")\n\t" // second output overwrites block 1
1170 +       "movq   %%mm3, 1032(%0, %%"REG_S")\n\t"
1171 +       "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
1172 +       " jnz 1b                        \n\t" // loop until the offset reaches 0
1173 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
1174 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
1175 +       );
1176 +}
1177 +
1178 +/* todo: should be optimized better (mm7 doubles as a scratch register, so bias is reloaded on every iteration) */
1179 +static void mix32toS_3dnow (sample_t * samples, sample_t bias) /* 3DNow!: common = samples[i+256] + bias; surround = samples[i+768] + samples[i+1024]; samples[i] += common - surround; samples[i+256] = samples[i+512] + common + surround */
1180 +{
1181 +       __asm__ volatile(
1182 +       "mov $-1024, %%"REG_S"          \n\t" // byte offset, counts up from -1024 to 0
1183 +       ASMALIGN(4)
1184 +       "1:                     \n\t"
1185 +       "movd  %1, %%mm7                \n\t" // reload bias: mm7 is overwritten further down
1186 +       "punpckldq %1, %%mm7            \n\t" // mm7 = {bias, bias}
1187 +       "movq  1024(%0, %%"REG_S"), %%mm0\n\t" // second 1024-byte channel block
1188 +       "movq  1032(%0, %%"REG_S"), %%mm1\n\t"
1189 +       "movq  3072(%0, %%"REG_S"), %%mm4\n\t" // fourth channel block
1190 +       "movq  3080(%0, %%"REG_S"), %%mm5\n\t"
1191 +       "pfadd %%mm7, %%mm0             \n\t" // common
1192 +       "pfadd %%mm7, %%mm1             \n\t" // common
1193 +       "pfadd 4096(%0, %%"REG_S"), %%mm4\n\t" // surround
1194 +       "pfadd 4104(%0, %%"REG_S"), %%mm5\n\t" // surround
1195 +       "movq  (%0, %%"REG_S"), %%mm2   \n\t" // first channel block
1196 +       "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1197 +       "movq  2048(%0, %%"REG_S"), %%mm6\n\t" // third channel block
1198 +       "movq  2056(%0, %%"REG_S"), %%mm7\n\t" // mm7 reused as data register from here on
1199 +       "pfsub %%mm4, %%mm2             \n\t" // first output: surround subtracted
1200 +       "pfsub %%mm5, %%mm3             \n\t"
1201 +       "pfadd %%mm4, %%mm6             \n\t" // second output: surround added
1202 +       "pfadd %%mm5, %%mm7             \n\t"
1203 +       "pfadd %%mm0, %%mm2             \n\t" // + common
1204 +       "pfadd %%mm1, %%mm3             \n\t"
1205 +       "pfadd %%mm0, %%mm6             \n\t" // + common
1206 +       "pfadd %%mm1, %%mm7             \n\t"
1207 +       "movq  %%mm2, (%0, %%"REG_S")   \n\t" // first output goes to block 0
1208 +       "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1209 +       "movq  %%mm6, 1024(%0, %%"REG_S")\n\t" // second output overwrites block 1
1210 +       "movq  %%mm7, 1032(%0, %%"REG_S")\n\t"
1211 +       "add $16, %%"REG_S"             \n\t" // 16 bytes handled per iteration
1212 +       " jnz 1b                        \n\t" // loop until the offset reaches 0
1213 +       :: "r" (samples+256), "m" (bias) // pointer is pre-advanced so the negative offset can end at zero
1214 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffer, so the compiler must not reorder or cache accesses around it
1215 +       );
1216 +}
1217 +
1218 +static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias) /* 3DNow!: dest[i] = src[i] + src[i+256] + bias over 256 samples; note the (src, dest) argument order */
1219 +{
1220 +       __asm__ volatile(
1221 +               "movd  %2, %%mm7        \n\t" // mm7 low dword = bias
1222 +               "punpckldq %2, %%mm7    \n\t" // mm7 = {bias, bias}
1223 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset, counts up from -1024 to 0
1224 +               ASMALIGN(4)
1225 +               "1:                     \n\t"
1226 +               "movq  (%0, %%"REG_S"), %%mm0   \n\t" // first 1024-byte block of src
1227 +               "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
1228 +               "movq  16(%0, %%"REG_S"), %%mm2 \n\t"
1229 +               "movq  24(%0, %%"REG_S"), %%mm3 \n\t"
1230 +               "pfadd 1024(%0, %%"REG_S"), %%mm0\n\t" // + second block of src
1231 +               "pfadd 1032(%0, %%"REG_S"), %%mm1\n\t"
1232 +               "pfadd 1040(%0, %%"REG_S"), %%mm2\n\t"
1233 +               "pfadd 1048(%0, %%"REG_S"), %%mm3\n\t"
1234 +               "pfadd %%mm7, %%mm0             \n\t" // + bias
1235 +               "pfadd %%mm7, %%mm1             \n\t"
1236 +               "pfadd %%mm7, %%mm2             \n\t"
1237 +               "pfadd %%mm7, %%mm3             \n\t"
1238 +               "movq  %%mm0, (%1, %%"REG_S")   \n\t" // dest is overwritten, not accumulated
1239 +               "movq  %%mm1, 8(%1, %%"REG_S")  \n\t"
1240 +               "movq  %%mm2, 16(%1, %%"REG_S") \n\t"
1241 +               "movq  %%mm3, 24(%1, %%"REG_S") \n\t"
1242 +               "add $32, %%"REG_S"             \n\t" // 32 bytes handled per iteration
1243 +               " jnz 1b                        \n\t" // loop until the offset reaches 0
1244 +       :: "r" (src+256), "r" (dest+256), "m" (bias) // pointers are pre-advanced so the negative offset can end at zero
1245 +       : "%"REG_S, "memory" // memory clobber: the asm reads and writes the sample buffers, so the compiler must not reorder or cache accesses around it
1246 +       );
1247 +}
1248 +
1249 +static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias, /* 3DNow! downmix dispatcher: picks a mixing kernel from the (acmod, output) pair and runs it in place on samples */
1250 +             sample_t clev, sample_t slev) /* clev is not referenced in this function; slev is only compared against 0 */
1251 +{
1252 +    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { /* pairs without a case below fall out of the switch unchanged */
1253 +
1254 +    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
1255 +       memcpy (samples, samples + 256, 256 * sizeof (sample_t)); /* second 256-sample block replaces the first */
1256 +       break;
1257 +
1258 +    case CONVERT (A52_CHANNEL, A52_MONO):
1259 +    case CONVERT (A52_STEREO, A52_MONO):
1260 +    mix_2to1_3dnow: /* shared target for the slev == 0 shortcuts below */
1261 +       mix2to1_3dnow (samples, samples + 256, bias);
1262 +       break;
1263 +
1264 +    case CONVERT (A52_2F1R, A52_MONO):
1265 +       if (slev == 0)
1266 +           goto mix_2to1_3dnow; /* otherwise falls through to the 3-to-1 mix */
1267 +    case CONVERT (A52_3F, A52_MONO):
1268 +    mix_3to1_3dnow:
1269 +       mix3to1_3dnow (samples, bias);
1270 +       break;
1271 +
1272 +    case CONVERT (A52_3F1R, A52_MONO):
1273 +       if (slev == 0)
1274 +           goto mix_3to1_3dnow; /* otherwise falls through into the next case, which tests slev again */
1275 +    case CONVERT (A52_2F2R, A52_MONO):
1276 +       if (slev == 0)
1277 +           goto mix_2to1_3dnow;
1278 +       mix4to1_3dnow (samples, bias);
1279 +       break;
1280 +
1281 +    case CONVERT (A52_3F2R, A52_MONO):
1282 +       if (slev == 0)
1283 +           goto mix_3to1_3dnow;
1284 +       mix5to1_3dnow (samples, bias);
1285 +       break;
1286 +
1287 +    case CONVERT (A52_MONO, A52_DOLBY):
1288 +       memcpy (samples + 256, samples, 256 * sizeof (sample_t)); /* duplicate the first block into the second */
1289 +       break;
1290 +
1291 +    case CONVERT (A52_3F, A52_STEREO):
1292 +    case CONVERT (A52_3F, A52_DOLBY):
1293 +    mix_3to2_3dnow: /* shared target for the slev == 0 shortcuts below */
1294 +       mix3to2_3dnow (samples, bias);
1295 +       break;
1296 +
1297 +    case CONVERT (A52_2F1R, A52_STEREO):
1298 +       if (slev == 0)
1299 +           break; /* nothing is mixed in; the first two blocks stay as they are */
1300 +       mix21to2_3dnow (samples, samples + 256, bias);
1301 +       break;
1302 +
1303 +    case CONVERT (A52_2F1R, A52_DOLBY):
1304 +       mix21toS_3dnow (samples, bias);
1305 +       break;
1306 +
1307 +    case CONVERT (A52_3F1R, A52_STEREO):
1308 +       if (slev == 0)
1309 +           goto mix_3to2_3dnow;
1310 +       mix31to2_3dnow (samples, bias);
1311 +       break;
1312 +
1313 +    case CONVERT (A52_3F1R, A52_DOLBY):
1314 +       mix31toS_3dnow (samples, bias);
1315 +       break;
1316 +
1317 +    case CONVERT (A52_2F2R, A52_STEREO):
1318 +       if (slev == 0)
1319 +           break;
1320 +       mix2to1_3dnow (samples, samples + 512, bias); /* block 2 is added into block 0 */
1321 +       mix2to1_3dnow (samples + 256, samples + 768, bias); /* block 3 is added into block 1 */
1322 +       break;
1323 +
1324 +    case CONVERT (A52_2F2R, A52_DOLBY):
1325 +       mix22toS_3dnow (samples, bias);
1326 +       break;
1327 +
1328 +    case CONVERT (A52_3F2R, A52_STEREO):
1329 +       if (slev == 0)
1330 +           goto mix_3to2_3dnow;
1331 +       mix32to2_3dnow (samples, bias);
1332 +       break;
1333 +
1334 +    case CONVERT (A52_3F2R, A52_DOLBY):
1335 +       mix32toS_3dnow (samples, bias);
1336 +       break;
1337 +
1338 +    case CONVERT (A52_3F1R, A52_3F):
1339 +       if (slev == 0)
1340 +           break;
1341 +       mix21to2_3dnow (samples, samples + 512, bias); /* block 3 (plus bias) is added into blocks 0 and 2 */
1342 +       break;
1343 +
1344 +    case CONVERT (A52_3F2R, A52_3F):
1345 +       if (slev == 0)
1346 +           break;
1347 +       mix2to1_3dnow (samples, samples + 768, bias); /* block 3 is added into block 0 */
1348 +       mix2to1_3dnow (samples + 512, samples + 1024, bias); /* block 4 is added into block 2 */
1349 +       break;
1350 +
1351 +    case CONVERT (A52_3F1R, A52_2F1R):
1352 +       mix3to2_3dnow (samples, bias);
1353 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); /* block 3 moves down into block 2 */
1354 +       break;
1355 +
1356 +    case CONVERT (A52_2F2R, A52_2F1R):
1357 +       mix2to1_3dnow (samples + 512, samples + 768, bias);
1358 +       break;
1359 +
1360 +    case CONVERT (A52_3F2R, A52_2F1R):
1361 +       mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
1362 +       move2to1_3dnow (samples + 768, samples + 512, bias); /* block 2 = block 3 + block 4 + bias */
1363 +       break;
1364 +
1365 +    case CONVERT (A52_3F2R, A52_3F1R):
1366 +       mix2to1_3dnow (samples + 768, samples + 1024, bias);
1367 +       break;
1368 +
1369 +    case CONVERT (A52_2F1R, A52_2F2R):
1370 +       memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); /* duplicate block 2 into block 3 */
1371 +       break;
1372 +
1373 +    case CONVERT (A52_3F1R, A52_2F2R):
1374 +       mix3to2_3dnow (samples, bias);
1375 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); /* block 3 is copied into block 2 */
1376 +       break;
1377 +
1378 +    case CONVERT (A52_3F2R, A52_2F2R):
1379 +       mix3to2_3dnow (samples, bias);
1380 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); /* shift blocks 3 and 4 down by one block each */
1381 +       memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
1382 +       break;
1383 +
1384 +    case CONVERT (A52_3F1R, A52_3F2R):
1385 +       memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); /* duplicate block 3 into block 4 */
1386 +       break;
1387 +    }
1388 +    __asm__ volatile("femms":::"memory"); /* runs on every path, including unmatched pairs: clears the MMX/3DNow! state so x87 floating-point code can be used again */
1389 +}
1390 +
1391 +#endif // ARCH_X86 || ARCH_X86_64
1392 --- liba52/imdct.c      2008-02-19 00:18:33.000000000 +0100
1393 +++ liba52/imdct.c      2008-02-19 00:16:40.000000000 +0100
1394 @@ -22,6 +26,11 @@
1395   * You should have received a copy of the GNU General Public License
1396   * along with this program; if not, write to the Free Software
1397   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
1398 + *
1399 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
1400 + * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
1401 + *   Michael ported them from libac3 (untested, perhaps totally broken)
1402 + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
1403   */
1404
1405  #include "config.h"
1406 @@ -39,12 +48,50 @@
1407  #include "a52.h"
1408  #include "a52_internal.h"
1409  #include "mm_accel.h"
1410 +#include "mangle.h"
1411 +
1412 +void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); /* a52_imdct_512 is now a function pointer; the plain C routine formerly of this name is renamed imdct_do_512 further down */
1413 +
1414 +#if CONFIG_RUNTIME_CPUDETECT /* with runtime CPU detection the compile-time 3DNow!Ext setting is forced to 0 */
1415 +#undef HAVE_AMD3DNOWEXT
1416 +#define HAVE_AMD3DNOWEXT 0
1417 +#endif
1418
1419  typedef struct complex_s {
1420      sample_t real;
1421      sample_t imag;
1422  } complex_t;
1423
1424 +static const int pm128[128] attribute_used __attribute__((aligned(16))) = /* 16-byte aligned table holding each index 0..127 exactly once; its consumer is not visible in this part of the patch (presumably referenced from inline asm, hence attribute_used -- TODO confirm) */
1425 +{
1426 +       0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
1427 +       4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
1428 +       2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
1429 +       6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
1430 +       1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121,
1431 +       5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
1432 +       3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
1433 +       7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
1434 +};
1435 +
1436 +static uint8_t attribute_used bit_reverse_512[] = { /* 128 entries: entry i is i with its 7 bits reversed (1 -> 0x40, 2 -> 0x20, 3 -> 0x60); imdct_do_512_altivec uses it to pick the input index for buf[i] */
1437 +       0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
1438 +       0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
1439 +       0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
1440 +       0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
1441 +       0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
1442 +       0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
1443 +       0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
1444 +       0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
1445 +       0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
1446 +       0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
1447 +       0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
1448 +       0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
1449 +       0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
1450 +       0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
1451 +       0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
1452 +       0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
1453 +
1454  static uint8_t fftorder[] = {
1455        0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
1456        8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
1457 @@ -56,6 +103,40 @@
1458        6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
1459  };
1460
1461 +static complex_t __attribute__((aligned(16))) buf[128]; /* file-scope working buffer of 128 complex values; imdct_do_512_altivec fills and transforms it in place */
1462 +
1463 +/* Twiddle factor LUT */
1464 +static complex_t __attribute__((aligned(16))) w_1[1];
1465 +static complex_t __attribute__((aligned(16))) w_2[2];
1466 +static complex_t __attribute__((aligned(16))) w_4[4];
1467 +static complex_t __attribute__((aligned(16))) w_8[8];
1468 +static complex_t __attribute__((aligned(16))) w_16[16];
1469 +static complex_t __attribute__((aligned(16))) w_32[32];
1470 +static complex_t __attribute__((aligned(16))) w_64[64];
1471 +static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; /* w[k] points to the table with 2^k entries */
1472 +
1473 +/* Twiddle factors for IMDCT */
1474 +static sample_t __attribute__((aligned(16))) xcos1[128]; /* read in the pre-IFFT complex multiply of imdct_do_512_altivec */
1475 +static sample_t __attribute__((aligned(16))) xsin1[128];
1476 +
1477 +#if ARCH_X86 || ARCH_X86_64
1478 +// NOTE: SSE needs 16-byte alignment or it will segfault
1479 +//
1480 +static float __attribute__((aligned(16))) sseSinCos1c[256];
1481 +static float __attribute__((aligned(16))) sseSinCos1d[256];
1482 +static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1};
1483 +//static float __attribute__((aligned(16))) sseW0[4];
1484 +static float __attribute__((aligned(16))) sseW1[8];
1485 +static float __attribute__((aligned(16))) sseW2[16];
1486 +static float __attribute__((aligned(16))) sseW3[32];
1487 +static float __attribute__((aligned(16))) sseW4[64];
1488 +static float __attribute__((aligned(16))) sseW5[128];
1489 +static float __attribute__((aligned(16))) sseW6[256];
1490 +static float __attribute__((aligned(16))) *sseW[7]= /* sseW[k] holds 2^(k+2) floats for k = 1..6; slot 0 is NULL because sseW0 is commented out */
1491 +       {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
1492 +static float __attribute__((aligned(16))) sseWindow[512];
1493 +#endif
1494 +
1495  /* Root values for IFFT */
1496  static sample_t roots16[3];
1497  static sample_t roots32[7];
1498 @@ -241,7 +322,7 @@
1499      ifft_pass (buf, roots128 - 32, 32);
1500  }
1501
1502 -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias)
1503 +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
1504  {
1505      int i, k;
1506      sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
1507 @@ -285,6 +366,702 @@
1508      }
1509  }
1510
1511 +#if HAVE_ALTIVEC
1512 +
1513 +#ifdef HAVE_ALTIVEC_H
1514 +#include <altivec.h>
1515 +#endif
1516 +
1517 +// used to build registers permutation vectors (vcprm)
1518 +// the 's' are for words in the _s_econd vector
1519 +#define WORD_0 0x00,0x01,0x02,0x03
1520 +#define WORD_1 0x04,0x05,0x06,0x07
1521 +#define WORD_2 0x08,0x09,0x0a,0x0b
1522 +#define WORD_3 0x0c,0x0d,0x0e,0x0f
1523 +#define WORD_s0 0x10,0x11,0x12,0x13
1524 +#define WORD_s1 0x14,0x15,0x16,0x17
1525 +#define WORD_s2 0x18,0x19,0x1a,0x1b
1526 +#define WORD_s3 0x1c,0x1d,0x1e,0x1f
1527 +
1528 +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
1529 +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
1530 +
1531 +#define FOUROF(a) {a,a,a,a}
1532 +
1533 +// vcprmle keeps the same word indices as the SSE version.
1534 +// It is the same as vcprm with the argument order reversed
1535 +// ('le' stands for little endian).
1536 +#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
1537 +
1538 +// used to build inverse/identity vectors (vcii)
1539 +// n is _n_egative, p is _p_ositive
1540 +#define FLOAT_n -1.
1541 +#define FLOAT_p 1.
1542 +
1543 +/* 512-point IMDCT, AltiVec variant: works through the file-scope buf[], writes windowed output plus delay[] and bias into data[], then refills delay[]. */
1544 +void
1545 +imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
1546 +{
1547 +  int i;
1548 +  int k;
1549 +  int p,q;
1550 +  int m;
1551 +  long two_m;
1552 +  long two_m_plus_one;
1553 +
1554 +  sample_t tmp_b_i;
1555 +  sample_t tmp_b_r;
1556 +  sample_t tmp_a_i;
1557 +  sample_t tmp_a_r;
1558 +
1559 +  sample_t *data_ptr;
1560 +  sample_t *delay_ptr;
1561 +  sample_t *window_ptr;
1562 +
1563 +  /* 512 IMDCT with source and dest data in 'data' */
1564 +
1565 +  /* Pre-IFFT complex multiply plus IFFT complex conjugate & bit-reversed reordering */
1566 +  for( i=0; i < 128; i++) {
1567 +    /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
1568 +    int j= bit_reverse_512[i];
1569 +    buf[i].real =         (data[256-2*j-1] * xcos1[j])  -  (data[2*j]       * xsin1[j]);
1570 +    buf[i].imag = -1.0 * ((data[2*j]       * xcos1[j])  +  (data[256-2*j-1] * xsin1[j]));
1571 +  }
1572 +
1573 +  /* 1. iteration: butterflies on adjacent pairs buf[i], buf[i+1] */
1574 +  for(i = 0; i < 128; i += 2) {
1575 +#if 0
1576 +    tmp_a_r = buf[i].real;
1577 +    tmp_a_i = buf[i].imag;
1578 +    tmp_b_r = buf[i+1].real;
1579 +    tmp_b_i = buf[i+1].imag;
1580 +    buf[i].real = tmp_a_r + tmp_b_r;
1581 +    buf[i].imag =  tmp_a_i + tmp_b_i;
1582 +    buf[i+1].real = tmp_a_r - tmp_b_r;
1583 +    buf[i+1].imag =  tmp_a_i - tmp_b_i;
1584 +#else
1585 +    vector float temp, bufv;
1586 +
1587 +    bufv = vec_ld(i << 3, (float*)buf);
1588 +    temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
1589 +    bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
1590 +    vec_st(bufv, i << 3, (float*)buf);
1591 +#endif
1592 +  }
1593 +
1594 +  /* 2. iteration: butterflies on pairs two apart (buf[i]/buf[i+2], buf[i+1]/buf[i+3]) */
1595 +  // Note w[1]={{1,0}, {0,-1}}
1596 +  for(i = 0; i < 128; i += 4) {
1597 +#if 0
1598 +    tmp_a_r = buf[i].real;
1599 +    tmp_a_i = buf[i].imag;
1600 +    tmp_b_r = buf[i+2].real;
1601 +    tmp_b_i = buf[i+2].imag;
1602 +    buf[i].real = tmp_a_r + tmp_b_r;
1603 +    buf[i].imag =  tmp_a_i + tmp_b_i;
1604 +    buf[i+2].real = tmp_a_r - tmp_b_r;
1605 +    buf[i+2].imag =  tmp_a_i - tmp_b_i;
1606 +    tmp_a_r = buf[i+1].real;
1607 +    tmp_a_i = buf[i+1].imag;
1608 +    /* WARNING: im <-> re here ! */
1609 +    tmp_b_r = buf[i+3].imag;
1610 +    tmp_b_i = buf[i+3].real;
1611 +    buf[i+1].real = tmp_a_r + tmp_b_r;
1612 +    buf[i+1].imag =  tmp_a_i - tmp_b_i;
1613 +    buf[i+3].real = tmp_a_r - tmp_b_r;
1614 +    buf[i+3].imag =  tmp_a_i + tmp_b_i;
1615 +#else
1616 +    vector float buf01, buf23, temp1, temp2;
1617 +
1618 +    buf01 = vec_ld((i + 0) << 3, (float*)buf);
1619 +    buf23 = vec_ld((i + 2) << 3, (float*)buf);
1620 +    buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
1621 +
1622 +    temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
1623 +    temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
1624 +
1625 +    vec_st(temp1, (i + 0) << 3, (float*)buf);
1626 +    vec_st(temp2, (i + 2) << 3, (float*)buf);
1627 +#endif
1628 +  }
1629 +
1630 +  /* 3. iteration: butterflies on pairs four apart (buf[i+n]/buf[i+n+4], n = 0..3) */
1631 +  for(i = 0; i < 128; i += 8) {
1632 +#if 0
1633 +    tmp_a_r = buf[i].real;
1634 +    tmp_a_i = buf[i].imag;
1635 +    tmp_b_r = buf[i+4].real;
1636 +    tmp_b_i = buf[i+4].imag;
1637 +    buf[i].real = tmp_a_r + tmp_b_r;
1638 +    buf[i].imag =  tmp_a_i + tmp_b_i;
1639 +    buf[i+4].real = tmp_a_r - tmp_b_r;
1640 +    buf[i+4].imag =  tmp_a_i - tmp_b_i;
1641 +    tmp_a_r = buf[1+i].real;
1642 +    tmp_a_i = buf[1+i].imag;
1643 +    tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
1644 +    tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
1645 +    buf[1+i].real = tmp_a_r + tmp_b_r;
1646 +    buf[1+i].imag =  tmp_a_i + tmp_b_i;
1647 +    buf[i+5].real = tmp_a_r - tmp_b_r;
1648 +    buf[i+5].imag =  tmp_a_i - tmp_b_i;
1649 +    tmp_a_r = buf[i+2].real;
1650 +    tmp_a_i = buf[i+2].imag;
1651 +    /* WARNING re <-> im & sign */
1652 +    tmp_b_r = buf[i+6].imag;
1653 +    tmp_b_i = - buf[i+6].real;
1654 +    buf[i+2].real = tmp_a_r + tmp_b_r;
1655 +    buf[i+2].imag =  tmp_a_i + tmp_b_i;
1656 +    buf[i+6].real = tmp_a_r - tmp_b_r;
1657 +    buf[i+6].imag =  tmp_a_i - tmp_b_i;
1658 +    tmp_a_r = buf[i+3].real;
1659 +    tmp_a_i = buf[i+3].imag;
1660 +    tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
1661 +    tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
1662 +    buf[i+3].real = tmp_a_r + tmp_b_r;
1663 +    buf[i+3].imag =  tmp_a_i + tmp_b_i;
1664 +    buf[i+7].real = tmp_a_r - tmp_b_r;
1665 +    buf[i+7].imag =  tmp_a_i - tmp_b_i;
1666 +#else
1667 +    vector float buf01, buf23, buf45, buf67;
1668 +
1669 +    buf01 = vec_ld((i + 0) << 3, (float*)buf);
1670 +    buf23 = vec_ld((i + 2) << 3, (float*)buf);
1671 +    /* scale buf[i+5] and buf[i+7] by their twiddle factors in scalar code and store them back before the vector loads below */
1672 +    tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
1673 +    tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
1674 +    buf[i+5].real = tmp_b_r;
1675 +    buf[i+5].imag = tmp_b_i;
1676 +    tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
1677 +    tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
1678 +    buf[i+7].real = tmp_b_r;
1679 +    buf[i+7].imag = tmp_b_i;
1680 +    /* NOTE(review): buf23 was already loaded above and buf[i+2], buf[i+3] are not written in between - this reload looks redundant */
1681 +    buf23 = vec_ld((i + 2) << 3, (float*)buf);
1682 +    buf45 = vec_ld((i + 4) << 3, (float*)buf);
1683 +    buf67 = vec_ld((i + 6) << 3, (float*)buf);
1684 +    buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
1685 +
1686 +    vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
1687 +    vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
1688 +    vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
1689 +    vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
1690 +#endif
1691 +  }
1692 +
1693 +  /* 4-7. iterations: general butterflies with twiddles w[m][k], two complex values per vector */
1694 +  for (m=3; m < 7; m++) {
1695 +    two_m = (1 << m);
1696 +
1697 +    two_m_plus_one = two_m<<1;
1698 +
1699 +    for(i = 0; i < 128; i += two_m_plus_one) {
1700 +      for(k = 0; k < two_m; k+=2) {
1701 +#if 0
1702 +        int p = k + i;
1703 +        int q = p + two_m;
1704 +        tmp_a_r = buf[p].real;
1705 +        tmp_a_i = buf[p].imag;
1706 +        tmp_b_r =
1707 +          buf[q].real * w[m][k].real -
1708 +          buf[q].imag * w[m][k].imag;
1709 +        tmp_b_i =
1710 +          buf[q].imag * w[m][k].real +
1711 +          buf[q].real * w[m][k].imag;
1712 +        buf[p].real = tmp_a_r + tmp_b_r;
1713 +        buf[p].imag =  tmp_a_i + tmp_b_i;
1714 +        buf[q].real = tmp_a_r - tmp_b_r;
1715 +        buf[q].imag =  tmp_a_i - tmp_b_i;
1716 +
1717 +        tmp_a_r = buf[(p + 1)].real;
1718 +        tmp_a_i = buf[(p + 1)].imag;
1719 +        tmp_b_r =
1720 +          buf[(q + 1)].real * w[m][(k + 1)].real -
1721 +          buf[(q + 1)].imag * w[m][(k + 1)].imag;
1722 +        tmp_b_i =
1723 +          buf[(q + 1)].imag * w[m][(k + 1)].real +
1724 +          buf[(q + 1)].real * w[m][(k + 1)].imag;
1725 +        buf[(p + 1)].real = tmp_a_r + tmp_b_r;
1726 +        buf[(p + 1)].imag =  tmp_a_i + tmp_b_i;
1727 +        buf[(q + 1)].real = tmp_a_r - tmp_b_r;
1728 +        buf[(q + 1)].imag =  tmp_a_i - tmp_b_i;
1729 +#else
1730 +        int p = k + i;
1731 +        int q = p + two_m;
1732 +        vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
1733 +        const vector float vczero = (const vector float)FOUROF(0.);
1734 +        // first compute the complex products buf[q]*w[m][k] and buf[q+1]*w[m][k+1]
1735 +        vecq = vec_ld(q << 3, (float*)buf);
1736 +        vecw = vec_ld(0, (float*)&(w[m][k]));
1737 +        temp1 = vec_madd(vecq, vecw, vczero);
1738 +        temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
1739 +        temp2 = vec_madd(temp2, vecw, vczero);
1740 +        temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
1741 +        temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
1742 +        vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
1743 +        // then butterfly with buf[p] and buf[p+1]
1744 +        vecp = vec_ld(p << 3, (float*)buf);
1745 +
1746 +        temp1 = vec_add(vecp, vecq);
1747 +        temp2 = vec_sub(vecp, vecq);
1748 +
1749 +        vec_st(temp1, p << 3, (float*)buf);
1750 +        vec_st(temp2, q << 3, (float*)buf);
1751 +#endif
1752 +      }
1753 +    }
1754 +  }
1755 +
1756 +  /* Post-IFFT complex multiply plus IFFT complex conjugate */
1757 +  for( i=0; i < 128; i+=4) {
1758 +    /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
1759 +#if 0
1760 +    tmp_a_r =        buf[(i + 0)].real;
1761 +    tmp_a_i = -1.0 * buf[(i + 0)].imag;
1762 +    buf[(i + 0)].real =
1763 +      (tmp_a_r * xcos1[(i + 0)])  -  (tmp_a_i  * xsin1[(i + 0)]);
1764 +    buf[(i + 0)].imag =
1765 +      (tmp_a_r * xsin1[(i + 0)])  +  (tmp_a_i  * xcos1[(i + 0)]);
1766 +
1767 +    tmp_a_r =        buf[(i + 1)].real;
1768 +    tmp_a_i = -1.0 * buf[(i + 1)].imag;
1769 +    buf[(i + 1)].real =
1770 +      (tmp_a_r * xcos1[(i + 1)])  -  (tmp_a_i  * xsin1[(i + 1)]);
1771 +    buf[(i + 1)].imag =
1772 +      (tmp_a_r * xsin1[(i + 1)])  +  (tmp_a_i  * xcos1[(i + 1)]);
1773 +
1774 +    tmp_a_r =        buf[(i + 2)].real;
1775 +    tmp_a_i = -1.0 * buf[(i + 2)].imag;
1776 +    buf[(i + 2)].real =
1777 +      (tmp_a_r * xcos1[(i + 2)])  -  (tmp_a_i  * xsin1[(i + 2)]);
1778 +    buf[(i + 2)].imag =
1779 +      (tmp_a_r * xsin1[(i + 2)])  +  (tmp_a_i  * xcos1[(i + 2)]);
1780 +
1781 +    tmp_a_r =        buf[(i + 3)].real;
1782 +    tmp_a_i = -1.0 * buf[(i + 3)].imag;
1783 +    buf[(i + 3)].real =
1784 +      (tmp_a_r * xcos1[(i + 3)])  -  (tmp_a_i  * xsin1[(i + 3)]);
1785 +    buf[(i + 3)].imag =
1786 +      (tmp_a_r * xsin1[(i + 3)])  +  (tmp_a_i  * xcos1[(i + 3)]);
1787 +#else
1788 +    vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
1789 +    vector float temp0022, temp1133, tempCS01;
1790 +    const vector float vczero = (const vector float)FOUROF(0.);
1791 +
1792 +    bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
1793 +    bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
1794 +
1795 +    cosv = vec_ld(i << 2, xcos1);
1796 +    sinv = vec_ld(i << 2, xsin1);
1797 +
1798 +    temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
1799 +    temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
1800 +    tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
1801 +    temp1 = vec_madd(temp0022, tempCS01, vczero);
1802 +    tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
1803 +    temp2 = vec_madd(temp1133, tempCS01, vczero);
1804 +    bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
1805 +
1806 +    vec_st(bufv_0, (i + 0) << 3, (float*)buf);
1807 +
1808 +    /* same again with bufv_2 and the upper two cosv/sinv elements */
1809 +
1810 +    temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
1811 +    temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
1812 +    tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
1813 +    temp1 = vec_madd(temp0022, tempCS01, vczero);
1814 +    tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
1815 +    temp2 = vec_madd(temp1133, tempCS01, vczero);
1816 +    bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
1817 +
1818 +    vec_st(bufv_2, (i + 2) << 3, (float*)buf);
1819 +
1820 +#endif
1821 +  }
1822 +
1823 +  data_ptr = data;
1824 +  delay_ptr = delay;
1825 +  window_ptr = a52_imdct_window;
1826 +
1827 +  /* Window and convert to real valued signal */
1828 +  for(i=0; i< 64; i++) {
1829 +    *data_ptr++   = -buf[64+i].imag   * *window_ptr++ + *delay_ptr++ + bias;
1830 +    *data_ptr++   =  buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
1831 +  }
1832 +
1833 +  for(i=0; i< 64; i++) {
1834 +    *data_ptr++  = -buf[i].real       * *window_ptr++ + *delay_ptr++ + bias;
1835 +    *data_ptr++  =  buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
1836 +  }
1837 +
1838 +  /* The trailing edge of the window goes into the delay line */
1839 +  delay_ptr = delay;
1840 +
1841 +  for(i=0; i< 64; i++) {
1842 +    *delay_ptr++  = -buf[64+i].real   * *--window_ptr;
1843 +    *delay_ptr++  =  buf[64-i-1].imag * *--window_ptr;
1844 +  }
1845 +
1846 +  for(i=0; i<64; i++) {
1847 +    *delay_ptr++  =  buf[i].imag       * *--window_ptr;
1848 +    *delay_ptr++  = -buf[128-i-1].real * *--window_ptr;
1849 +  }
1850 +}
1851 +#endif
1852 +
1853 +
1854 +// Stuff below this line is borrowed from libac3
1855 +#include "srfftp.h"
1856 +#if ARCH_X86 || ARCH_X86_64
1857 +#undef HAVE_AMD3DNOW
1858 +#define HAVE_AMD3DNOW 1
1859 +#include "srfftp_3dnow.h"
1860 +// 8-byte-aligned constants; NOTE(review): presumably referenced by the 3DNow! code included here - confirm
1861 +const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
1862 +const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
1863 +const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
1864 +// imdct_3dnow.h is included twice, first with HAVE_AMD3DNOWEXT 0 and then 1; presumably it emits one variant per setting - confirm
1865 +#undef HAVE_AMD3DNOWEXT
1866 +#define HAVE_AMD3DNOWEXT 0
1867 +#include "imdct_3dnow.h"
1868 +#undef HAVE_AMD3DNOWEXT
1869 +#define HAVE_AMD3DNOWEXT 1
1870 +#include "imdct_3dnow.h"
1871 +/* 512-point IMDCT written as SSE inline asm: works through the file-scope buf[], writes windowed output plus delay[] and bias into data[], then rewrites delay[]. */
1872 +void
1873 +imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
1874 +{
1875 +/*     int i,k;
1876 +    int p,q;*/
1877 +    int m;
1878 +    long two_m;
1879 +    long two_m_plus_one;
1880 +    long two_m_plus_one_shl3;
1881 +    complex_t *buf_offset;
1882 +
1883 +/*  sample_t tmp_a_i;
1884 +    sample_t tmp_a_r;
1885 +    sample_t tmp_b_i;
1886 +    sample_t tmp_b_r;*/
1887 +
1888 +    sample_t *data_ptr;
1889 +    sample_t *delay_ptr;
1890 +    sample_t *window_ptr;
1891 +
1892 +    /* 512 IMDCT with source and dest data in 'data' */
1893 +    /* see the C version (dct_do_512()); it's almost identical, just in C */
1894 +
1895 +    /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1896 +    /* Bit reversed shuffling */
1897 +       __asm__ volatile(
1898 +               "xor %%"REG_S", %%"REG_S"               \n\t"
1899 +               "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
1900 +               "mov $1008, %%"REG_D"                   \n\t"
1901 +               "push %%"REG_BP"                        \n\t" //use ebp without telling gcc
1902 +               ASMALIGN(4)
1903 +               "1:                                     \n\t"
1904 +               "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
1905 +               "movhps 8(%0, %%"REG_D"), %%xmm0        \n\t" // RXXI
1906 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // XXXi
1907 +               "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
1908 +               "shufps $0x33, %%xmm1, %%xmm0           \n\t" // irIR
1909 +               "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
1910 +               "mulps %%xmm0, %%xmm2                   \n\t"
1911 +               "shufps $0xB1, %%xmm0, %%xmm0           \n\t" // riRI
1912 +               "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
1913 +               "subps %%xmm0, %%xmm2                   \n\t"
1914 +               "movzb (%%"REG_a"), %%"REG_d"           \n\t"
1915 +               "movzb 1(%%"REG_a"), %%"REG_BP"         \n\t"
1916 +               "movlps %%xmm2, (%1, %%"REG_d", 8)      \n\t"
1917 +               "movhps %%xmm2, (%1, %%"REG_BP", 8)     \n\t"
1918 +               "add $16, %%"REG_S"                     \n\t"
1919 +               "add $2, %%"REG_a"                      \n\t" // avoid complex addressing for P4 crap
1920 +               "sub $16, %%"REG_D"                     \n\t"
1921 +               "jnc 1b                                 \n\t"
1922 +               "pop %%"REG_BP"                         \n\t"//no we didnt touch ebp *g*
1923 +               :: "b" (data), "c" (buf)
1924 +               : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
1925 +       );
1926 +
1927 +    /* NOTE(review): the asm statements in this function store to buf/data/delay but list no "memory" clobber and no xmm registers - confirm the compiler cannot cache or reorder around them */
1928 +    /* FFT Merge */
1929 +/* unoptimized variant
1930 +    for (m=1; m < 7; m++) {
1931 +       if(m)
1932 +           two_m = (1 << m);
1933 +       else
1934 +           two_m = 1;
1935 +
1936 +       two_m_plus_one = (1 << (m+1));
1937 +
1938 +       for(i = 0; i < 128; i += two_m_plus_one) {
1939 +           for(k = 0; k < two_m; k++) {
1940 +               p = k + i;
1941 +               q = p + two_m;
1942 +               tmp_a_r = buf[p].real;
1943 +               tmp_a_i = buf[p].imag;
1944 +               tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
1945 +               tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
1946 +               buf[p].real = tmp_a_r + tmp_b_r;
1947 +               buf[p].imag =  tmp_a_i + tmp_b_i;
1948 +               buf[q].real = tmp_a_r - tmp_b_r;
1949 +               buf[q].imag =  tmp_a_i - tmp_b_i;
1950 +           }
1951 +       }
1952 +    }
1953 +*/
1954 +
1955 +    /* 1. iteration */
1956 +       // Note w[0][0]={1,0}
1957 +       __asm__ volatile(
1958 +               "xorps %%xmm1, %%xmm1   \n\t"
1959 +               "xorps %%xmm2, %%xmm2   \n\t"
1960 +               "mov %0, %%"REG_S"      \n\t"
1961 +               ASMALIGN(4)
1962 +               "1:                     \n\t"
1963 +               "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
1964 +               "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
1965 +               "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
1966 +               "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
1967 +               "addps %%xmm1, %%xmm0   \n\t"
1968 +               "subps %%xmm2, %%xmm0   \n\t"
1969 +               "movaps %%xmm0, (%%"REG_S")\n\t"
1970 +               "add $16, %%"REG_S"     \n\t"
1971 +               "cmp %1, %%"REG_S"      \n\t"
1972 +               " jb 1b                 \n\t"
1973 +               :: "g" (buf), "r" (buf + 128)
1974 +               : "%"REG_S
1975 +       );
1976 +
1977 +    /* 2. iteration */
1978 +       // Note w[1]={{1,0}, {0,-1}}
1979 +       __asm__ volatile(
1980 +               "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
1981 +               "mov %0, %%"REG_S"              \n\t"
1982 +               ASMALIGN(4)
1983 +               "1:                             \n\t"
1984 +               "movaps 16(%%"REG_S"), %%xmm2   \n\t" //r2,i2,r3,i3
1985 +               "shufps $0xB4, %%xmm2, %%xmm2   \n\t" //r2,i2,i3,r3
1986 +               "mulps %%xmm7, %%xmm2           \n\t" //r2,i2,i3,-r3
1987 +               "movaps (%%"REG_S"), %%xmm0     \n\t" //r0,i0,r1,i1
1988 +               "movaps (%%"REG_S"), %%xmm1     \n\t" //r0,i0,r1,i1
1989 +               "addps %%xmm2, %%xmm0           \n\t"
1990 +               "subps %%xmm2, %%xmm1           \n\t"
1991 +               "movaps %%xmm0, (%%"REG_S")     \n\t"
1992 +               "movaps %%xmm1, 16(%%"REG_S")   \n\t"
1993 +               "add $32, %%"REG_S"     \n\t"
1994 +               "cmp %1, %%"REG_S"      \n\t"
1995 +               " jb 1b                 \n\t"
1996 +               :: "g" (buf), "r" (buf + 128)
1997 +               : "%"REG_S
1998 +       );
1999 +
2000 +    /* 3. iteration */
2001 +/*
2002 + Note sseW2+0={1,1,sqrt(2),sqrt(2)}
2003 + Note sseW2+16={0,0,sqrt(2),-sqrt(2)}
2004 + Note sseW2+32={0,0,-sqrt(2),-sqrt(2)}
2005 + Note sseW2+48={1,-1,sqrt(2),-sqrt(2)}
2006 +*/
2007 +       __asm__ volatile(
2008 +               "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
2009 +               "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
2010 +               "xorps %%xmm5, %%xmm5           \n\t"
2011 +               "xorps %%xmm2, %%xmm2           \n\t"
2012 +               "mov %0, %%"REG_S"              \n\t"
2013 +               ASMALIGN(4)
2014 +               "1:                             \n\t"
2015 +               "movaps 32(%%"REG_S"), %%xmm2   \n\t" //r4,i4,r5,i5
2016 +               "movaps 48(%%"REG_S"), %%xmm3   \n\t" //r6,i6,r7,i7
2017 +               "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
2018 +               "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
2019 +               "mulps %%xmm2, %%xmm4           \n\t"
2020 +               "mulps %%xmm3, %%xmm5           \n\t"
2021 +               "shufps $0xB1, %%xmm2, %%xmm2   \n\t" //i4,r4,i5,r5
2022 +               "shufps $0xB1, %%xmm3, %%xmm3   \n\t" //i6,r6,i7,r7
2023 +               "mulps %%xmm6, %%xmm3           \n\t"
2024 +               "mulps %%xmm7, %%xmm2           \n\t"
2025 +               "movaps (%%"REG_S"), %%xmm0     \n\t" //r0,i0,r1,i1
2026 +               "movaps 16(%%"REG_S"), %%xmm1   \n\t" //r2,i2,r3,i3
2027 +               "addps %%xmm4, %%xmm2           \n\t"
2028 +               "addps %%xmm5, %%xmm3           \n\t"
2029 +               "movaps %%xmm2, %%xmm4          \n\t"
2030 +               "movaps %%xmm3, %%xmm5          \n\t"
2031 +               "addps %%xmm0, %%xmm2           \n\t"
2032 +               "addps %%xmm1, %%xmm3           \n\t"
2033 +               "subps %%xmm4, %%xmm0           \n\t"
2034 +               "subps %%xmm5, %%xmm1           \n\t"
2035 +               "movaps %%xmm2, (%%"REG_S")     \n\t"
2036 +               "movaps %%xmm3, 16(%%"REG_S")   \n\t"
2037 +               "movaps %%xmm0, 32(%%"REG_S")   \n\t"
2038 +               "movaps %%xmm1, 48(%%"REG_S")   \n\t"
2039 +               "add $64, %%"REG_S"     \n\t"
2040 +               "cmp %1, %%"REG_S"      \n\t"
2041 +               " jb 1b                 \n\t"
2042 +               :: "g" (buf), "r" (buf + 128)
2043 +               : "%"REG_S
2044 +       );
2045 +
2046 +    /* 4-7. iterations */
2047 +    for (m=3; m < 7; m++) {
2048 +       two_m = (1 << m);
2049 +       two_m_plus_one = two_m<<1;
2050 +       two_m_plus_one_shl3 = (two_m_plus_one<<3);
2051 +       buf_offset = buf+128;
2052 +       __asm__ volatile(
2053 +               "mov %0, %%"REG_S"                      \n\t"
2054 +               ASMALIGN(4)
2055 +               "1:                                     \n\t"
2056 +               "xor %%"REG_D", %%"REG_D"               \n\t" // k
2057 +               "lea (%%"REG_S", %3), %%"REG_d"         \n\t"
2058 +               "2:                                     \n\t"
2059 +               "movaps (%%"REG_d", %%"REG_D"), %%xmm1  \n\t"
2060 +               "movaps (%4, %%"REG_D", 2), %%xmm2      \n\t"
2061 +               "mulps %%xmm1, %%xmm2                   \n\t"
2062 +               "shufps $0xB1, %%xmm1, %%xmm1           \n\t"
2063 +               "mulps 16(%4, %%"REG_D", 2), %%xmm1     \n\t"
2064 +               "movaps (%%"REG_S", %%"REG_D"), %%xmm0  \n\t"
2065 +               "addps %%xmm2, %%xmm1                   \n\t"
2066 +               "movaps %%xmm1, %%xmm2                  \n\t"
2067 +               "addps %%xmm0, %%xmm1                   \n\t"
2068 +               "subps %%xmm2, %%xmm0                   \n\t"
2069 +               "movaps %%xmm1, (%%"REG_S", %%"REG_D")  \n\t"
2070 +               "movaps %%xmm0, (%%"REG_d", %%"REG_D")  \n\t"
2071 +               "add $16, %%"REG_D"                     \n\t"
2072 +               "cmp %3, %%"REG_D"                      \n\t" //FIXME (opt) count against 0
2073 +               "jb 2b                                  \n\t"
2074 +               "add %2, %%"REG_S"                      \n\t"
2075 +               "cmp %1, %%"REG_S"                      \n\t"
2076 +               " jb 1b                                 \n\t"
2077 +               :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
2078 +                  "r" (sseW[m])
2079 +               : "%"REG_S, "%"REG_D, "%"REG_d
2080 +       );
2081 +    }
2082 +
2083 +    /* Post IFFT complex multiply  plus IFFT complex conjugate*/
2084 +       __asm__ volatile(
2085 +               "mov $-1024, %%"REG_S"                  \n\t"
2086 +               ASMALIGN(4)
2087 +               "1:                                     \n\t"
2088 +               "movaps (%0, %%"REG_S"), %%xmm0         \n\t"
2089 +               "movaps (%0, %%"REG_S"), %%xmm1         \n\t"
2090 +               "shufps $0xB1, %%xmm0, %%xmm0           \n\t"
2091 +               "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
2092 +               "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
2093 +               "addps %%xmm1, %%xmm0                   \n\t"
2094 +               "movaps %%xmm0, (%0, %%"REG_S")         \n\t"
2095 +               "add $16, %%"REG_S"                     \n\t"
2096 +               " jnz 1b                                \n\t"
2097 +               :: "r" (buf+128)
2098 +               : "%"REG_S
2099 +       );
2100 +
2101 +
2102 +    data_ptr = data;
2103 +    delay_ptr = delay;
2104 +    window_ptr = a52_imdct_window;
2105 +
2106 +    /* Window and convert to real valued signal */
2107 +       __asm__ volatile(
2108 +               "xor %%"REG_D", %%"REG_D"               \n\t"  // 0
2109 +               "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
2110 +               "movss %3, %%xmm2                       \n\t"  // bias
2111 +               "shufps $0x00, %%xmm2, %%xmm2           \n\t"  // bias, bias, ...
2112 +               ASMALIGN(4)
2113 +               "1:                                     \n\t"
2114 +               "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? A ?
2115 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? C ?
2116 +               "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // ? D C ?
2117 +               "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // ? B A ?
2118 +               "shufps $0x99, %%xmm1, %%xmm0           \n\t" // D C B A
2119 +               "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
2120 +               "addps (%2, %%"REG_S"), %%xmm0          \n\t"
2121 +               "addps %%xmm2, %%xmm0                   \n\t"
2122 +               "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
2123 +               "add  $16, %%"REG_S"                    \n\t"
2124 +               "sub  $16, %%"REG_D"                    \n\t"
2125 +               "cmp  $512, %%"REG_S"                   \n\t"
2126 +               " jb 1b                                 \n\t"
2127 +               :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
2128 +               : "%"REG_S, "%"REG_D
2129 +       );
2130 +       data_ptr+=128;
2131 +       delay_ptr+=128;
2132 +//     window_ptr+=128;
2133 +
2134 +       __asm__ volatile(
2135 +               "mov $1024, %%"REG_D"                   \n\t"  // 1024
2136 +               "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
2137 +               "movss %3, %%xmm2                       \n\t"  // bias
2138 +               "shufps $0x00, %%xmm2, %%xmm2           \n\t"  // bias, bias, ...
2139 +               ASMALIGN(4)
2140 +               "1:                                     \n\t"
2141 +               "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? ? A
2142 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? ? C
2143 +               "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // D ? ? C
2144 +               "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // B ? ? A
2145 +               "shufps $0xCC, %%xmm1, %%xmm0           \n\t" // D C B A
2146 +               "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
2147 +               "addps (%2, %%"REG_S"), %%xmm0          \n\t"
2148 +               "addps %%xmm2, %%xmm0                   \n\t"
2149 +               "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
2150 +               "add $16, %%"REG_S"                     \n\t"
2151 +               "sub $16, %%"REG_D"                     \n\t"
2152 +               "cmp $512, %%"REG_S"                    \n\t"
2153 +               " jb 1b                                 \n\t"
2154 +               :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
2155 +               : "%"REG_S, "%"REG_D
2156 +       );
2157 +       data_ptr+=128;
2158 +//     window_ptr+=128;
2159 +
2160 +    /* The trailing edge of the window goes into the delay line */
2161 +    delay_ptr = delay;
2162 +
2163 +       __asm__ volatile(
2164 +               "xor %%"REG_D", %%"REG_D"               \n\t"  // 0
2165 +               "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
2166 +               ASMALIGN(4)
2167 +               "1:                                     \n\t"
2168 +               "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? ? A
2169 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? ? C
2170 +               "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // D ? ? C
2171 +               "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // B ? ? A
2172 +               "shufps $0xCC, %%xmm1, %%xmm0           \n\t" // D C B A
2173 +               "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
2174 +               "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
2175 +               "add $16, %%"REG_S"                     \n\t"
2176 +               "sub $16, %%"REG_D"                     \n\t"
2177 +               "cmp $512, %%"REG_S"                    \n\t"
2178 +               " jb 1b                                 \n\t"
2179 +               :: "r" (buf+64), "r" (delay_ptr)
2180 +               : "%"REG_S, "%"REG_D
2181 +       );
2182 +       delay_ptr+=128;
2183 +//     window_ptr-=128;
2184 +
2185 +       __asm__ volatile(
2186 +               "mov $1024, %%"REG_D"                   \n\t"  // 1024
2187 +               "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
2188 +               ASMALIGN(4)
2189 +               "1:                                     \n\t"
2190 +               "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
2191 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? C ?
2192 +               "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // ? D C ?
2193 +               "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // ? B A ?
2194 +               "shufps $0x99, %%xmm1, %%xmm0           \n\t" // D C B A
2195 +               "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
2196 +               "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
2197 +               "add $16, %%"REG_S"                     \n\t"
2198 +               "sub $16, %%"REG_D"                     \n\t"
2199 +               "cmp $512, %%"REG_S"                    \n\t"
2200 +               " jb 1b                                 \n\t"
2201 +               :: "r" (buf), "r" (delay_ptr)
2202 +               : "%"REG_S, "%"REG_D
2203 +       );
2204 +}
2205 +#endif // ARCH_X86 || ARCH_X86_64
2206 +
2207  void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
2208  {
2209      int i, k;
2210 @@ -364,7 +1141,7 @@
2211
2212  void a52_imdct_init (uint32_t mm_accel)
2213  {
2214 -    int i, k;
2215 +    int i, j, k;
2216      double sum;
2217
2218      /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
2219 @@ -416,6 +1193,99 @@
2220         post2[i].real = cos ((M_PI / 128) * (i + 0.5));
2221         post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
2222      }
2223 +    for (i = 0; i < 128; i++) {
2224 +       xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
2225 +       xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
2226 +    }
2227 +    for (i = 0; i < 7; i++) {
2228 +       j = 1 << i;
2229 +       for (k = 0; k < j; k++) {
2230 +           w[i][k].real = cos (-M_PI * k / j);
2231 +           w[i][k].imag = sin (-M_PI * k / j);
2232 +       }
2233 +    }
2234 +#if ARCH_X86 || ARCH_X86_64
2235 +       for (i = 0; i < 128; i++) {
2236 +           sseSinCos1c[2*i+0]= xcos1[i];
2237 +           sseSinCos1c[2*i+1]= -xcos1[i];
2238 +           sseSinCos1d[2*i+0]= xsin1[i];
2239 +           sseSinCos1d[2*i+1]= xsin1[i];
2240 +       }
2241 +       for (i = 1; i < 7; i++) {
2242 +           j = 1 << i;
2243 +           for (k = 0; k < j; k+=2) {
2244 +
2245 +               sseW[i][4*k + 0] = w[i][k+0].real;
2246 +               sseW[i][4*k + 1] = w[i][k+0].real;
2247 +               sseW[i][4*k + 2] = w[i][k+1].real;
2248 +               sseW[i][4*k + 3] = w[i][k+1].real;
2249 +
2250 +               sseW[i][4*k + 4] = -w[i][k+0].imag;
2251 +               sseW[i][4*k + 5] = w[i][k+0].imag;
2252 +               sseW[i][4*k + 6] = -w[i][k+1].imag;
2253 +               sseW[i][4*k + 7] = w[i][k+1].imag;
2254 +
2255 +       //we multiply more or less uninitialized numbers so we need to use exactly 0.0
2256 +               if(k==0)
2257 +               {
2258 +//                     sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
2259 +                       sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
2260 +               }
2261 +
2262 +               if(2*k == j)
2263 +               {
2264 +                       sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
2265 +//                     sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
2266 +               }
2267 +           }
2268 +       }
2269 +
2270 +       for(i=0; i<128; i++)
2271 +       {
2272 +               sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
2273 +               sseWindow[2*i+1]=  a52_imdct_window[2*i+1];
2274 +       }
2275 +
2276 +       for(i=0; i<64; i++)
2277 +       {
2278 +               sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];
2279 +               sseWindow[256 + 2*i+1]=  a52_imdct_window[254 - 2*i+0];
2280 +               sseWindow[384 + 2*i+0]=  a52_imdct_window[126 - 2*i+1];
2281 +               sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0];
2282 +       }
2283 +#endif
2284 +       a52_imdct_512 = imdct_do_512;
2285 +       ifft128 = ifft128_c;
2286 +       ifft64 = ifft64_c;
2287 +
2288 +#if ARCH_X86 || ARCH_X86_64
2289 +       if(mm_accel & MM_ACCEL_X86_SSE)
2290 +       {
2291 +         fprintf (stderr, "Using SSE optimized IMDCT transform\n");
2292 +         a52_imdct_512 = imdct_do_512_sse;
2293 +       }
2294 +       else
2295 +       if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
2296 +       {
2297 +         fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
2298 +         a52_imdct_512 = imdct_do_512_3dnowex;
2299 +       }
2300 +       else
2301 +       if(mm_accel & MM_ACCEL_X86_3DNOW)
2302 +       {
2303 +         fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
2304 +         a52_imdct_512 = imdct_do_512_3dnow;
2305 +       }
2306 +       else
2307 +#endif // ARCH_X86 || ARCH_X86_64
2308 +#if HAVE_ALTIVEC
2309 +        if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
2310 +       {
2311 +         fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
2312 +          a52_imdct_512 = imdct_do_512_altivec;
2313 +       }
2314 +       else
2315 +#endif
2316
2317  #ifdef LIBA52_DJBFFT
2318      if (mm_accel & MM_ACCEL_DJBFFT) {
2319 @@ -426,7 +1296,5 @@
2320  #endif
2321      {
2322         fprintf (stderr, "No accelerated IMDCT transform found\n");
2323 -       ifft128 = ifft128_c;
2324 -       ifft64 = ifft64_c;
2325      }
2326  }
2327 --- include/mm_accel.h  2006-06-12 15:05:00.000000000 +0200
2328 +++ liba52/mm_accel.h   2006-06-05 02:23:04.000000000 +0200
2329 @@ -30,7 +34,12 @@
2330  /* x86 accelerations */
2331  #define MM_ACCEL_X86_MMX       0x80000000
2332  #define MM_ACCEL_X86_3DNOW     0x40000000
2333 +#define MM_ACCEL_X86_3DNOWEXT  0x08000000
2334  #define MM_ACCEL_X86_MMXEXT    0x20000000
2335 +#define MM_ACCEL_X86_SSE       0x10000000
2336 +
2337 +/* PPC accelerations */
2338 +#define MM_ACCEL_PPC_ALTIVEC   0x00010000
2339
2340  uint32_t mm_accel (void);
2341
2342 --- liba52/parse.c      2006-12-05 08:08:01.000000000 +0100
2343 +++ liba52/parse.c      2006-12-05 08:08:44.000000000 +0100
2344 @@ -24,6 +28,7 @@
2345  #include "config.h"
2346
2347  #include <stdlib.h>
2348 +#include <stdio.h>
2349  #include <string.h>
2350  #include <inttypes.h>
2351
2352 @@ -31,13 +36,12 @@
2353  #include "a52_internal.h"
2354  #include "bitstream.h"
2355  #include "tables.h"
2356 +#include "mm_accel.h"
2357 +#include "libavutil/avutil.h"
2358
2359 -#ifdef HAVE_MEMALIGN
2360 +#if HAVE_MEMALIGN
2361  /* some systems have memalign() but no declaration for it */
2362  void * memalign (size_t align, size_t size);
2363 -#else
2364 -/* assume malloc alignment is sufficient */
2365 -#define memalign(align,size) malloc (size)
2366  #endif
2367
2368  typedef struct {
2369 @@ -60,7 +64,16 @@
2370      if (state == NULL)
2371         return NULL;
2372
2373 +#if defined(__MINGW32__) && defined(HAVE_SSE)
2374 +    state->samples = av_malloc(256 * 12 * sizeof (sample_t)); /* released with av_free under the same condition in a52_free */
2375 +#else
2376      state->samples = memalign (16, 256 * 12 * sizeof (sample_t));
2377 +#endif
2378 +    if(((uintptr_t)state->samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){ /* drop SSE when the buffer is not 16-byte aligned; uintptr_t keeps the pointer-to-integer cast lossless on 64-bit targets; a NULL buffer passes here and is rejected by the check below */
2379 +      mm_accel &=~MM_ACCEL_X86_SSE;
2380 +      fprintf(stderr, "liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n");
2381 +    }
2382 +
2383      if (state->samples == NULL) {
2384         free (state);
2385         return NULL;
2386 @@ -74,6 +87,7 @@
2387      state->lfsr_state = 1;
2388
2389      a52_imdct_init (mm_accel);
2390 +    downmix_accel_init(mm_accel);
2391
2392      return state;
2393  }
2394 @@ -141,7 +155,7 @@
2395      state->acmod = acmod = buf[6] >> 5;
2396
2397      a52_bitstream_set_ptr (state, buf + 6);
2398 -    bitstream_get (state, 3);  /* skip acmod we already parsed */
2399 +    bitstream_skip (state, 3); /* skip acmod we already parsed */
2400
2401      if ((acmod == 2) && (bitstream_get (state, 2) == 2))       /* dsurmod */
2402         acmod = A52_DOLBY;
2403 @@ -172,28 +186,28 @@
2404
2405      chaninfo = !acmod;
2406      do {
2407 -       bitstream_get (state, 5);       /* dialnorm */
2408 +       bitstream_skip (state, 5);      /* dialnorm */
2409         if (bitstream_get (state, 1))   /* compre */
2410 -           bitstream_get (state, 8);   /* compr */
2411 +           bitstream_skip (state, 8);  /* compr */
2412         if (bitstream_get (state, 1))   /* langcode */
2413 -           bitstream_get (state, 8);   /* langcod */
2414 +           bitstream_skip (state, 8);  /* langcod */
2415         if (bitstream_get (state, 1))   /* audprodie */
2416 -           bitstream_get (state, 7);   /* mixlevel + roomtyp */
2417 +           bitstream_skip (state, 7);  /* mixlevel + roomtyp */
2418      } while (chaninfo--);
2419
2420 -    bitstream_get (state, 2);          /* copyrightb + origbs */
2421 +    bitstream_skip (state, 2);         /* copyrightb + origbs */
2422
2423      if (bitstream_get (state, 1))      /* timecod1e */
2424 -       bitstream_get (state, 14);      /* timecod1 */
2425 +       bitstream_skip (state, 14);     /* timecod1 */
2426      if (bitstream_get (state, 1))      /* timecod2e */
2427 -       bitstream_get (state, 14);      /* timecod2 */
2428 +       bitstream_skip (state, 14);     /* timecod2 */
2429
2430      if (bitstream_get (state, 1)) {    /* addbsie */
2431         int addbsil;
2432
2433         addbsil = bitstream_get (state, 6);
2434         do {
2435 -           bitstream_get (state, 8);   /* addbsi */
2436 +           bitstream_skip (state, 8);  /* addbsi */
2437         } while (addbsil--);
2438      }
2439
2440 @@ -680,7 +694,7 @@
2441                                  state->fbw_expbap[i].exp[0],
2442                                  state->fbw_expbap[i].exp + 1))
2443                 return 1;
2444 -           bitstream_get (state, 2);   /* gainrng */
2445 +           bitstream_skip (state, 2);  /* gainrng */
2446         }
2447      if (lfeexpstr != EXP_REUSE) {
2448         do_bit_alloc |= 32;
2449 @@ -755,7 +769,7 @@
2450      if (bitstream_get (state, 1)) {    /* skiple */
2451         i = bitstream_get (state, 9);   /* skipl */
2452         while (i--)
2453 -           bitstream_get (state, 8);
2454 +           bitstream_skip (state, 8);
2455      }
2456
2457      samples = state->samples;
2458 @@ -896,6 +910,10 @@
2459
2460  void a52_free (a52_state_t * state)
2461  {
2462 -    free (state->samples);
2463 +#if defined(__MINGW32__) && defined(HAVE_SSE)
2464 +    av_free (state->samples); /* pairs with the av_malloc allocation made under this same #if condition */
2465 +#else
2466 +     free (state->samples); /* otherwise the buffer was obtained from memalign */
2467 +#endif
2468      free (state);
2469  }