liba52/liba52_changes.diff

   1 --- include/a52.h       2006-06-12 15:04:57.000000000 +0200
   2 +++ liba52/a52.h        2006-06-05 02:23:02.000000000 +0200
   3 @@ -59,4 +63,9 @@
   4  int a52_block (a52_state_t * state);
   5  void a52_free (a52_state_t * state);
   6
   7 +void* a52_resample_init(uint32_t mm_accel,int flags,int chans);
   8 +extern int (* a52_resample) (float * _f, int16_t * s16);
   9 +
  10 +uint16_t crc16_block(uint8_t *data,uint32_t num_bytes);
  11 +
  12  #endif /* A52_H */
  13 --- liba52/a52_internal.h       2006-06-12 15:05:07.000000000 +0200
  14 +++ liba52/a52_internal.h       2006-06-05 02:23:02.000000000 +0200
  15 @@ -103,18 +107,34 @@
  16  #define DELTA_BIT_NONE (2)
  17  #define DELTA_BIT_RESERVED (3)
  18
  19 +#ifdef ARCH_X86_64
  20 +# define REG_a "rax"
  21 +# define REG_d "rdx"
  22 +# define REG_S "rsi"
  23 +# define REG_D "rdi"
  24 +# define REG_BP "rbp"
  25 +#else
  26 +# define REG_a "eax"
  27 +# define REG_d "edx"
  28 +# define REG_S "esi"
  29 +# define REG_D "edi"
  30 +# define REG_BP "ebp"
  31 +#endif
  32 +
  33  void a52_bit_allocate (a52_state_t * state, ba_t * ba, int bndstart,
  34                        int start, int end, int fastleak, int slowleak,
  35                        expbap_t * expbap);
  36
  37  int a52_downmix_init (int input, int flags, sample_t * level,
  38                       sample_t clev, sample_t slev);
  39 +void downmix_accel_init(uint32_t mm_accel);
  40  int a52_downmix_coeff (sample_t * coeff, int acmod, int output, sample_t level,
  41                        sample_t clev, sample_t slev);
  42 -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
  43 +extern void (*a52_downmix) (sample_t * samples, int acmod, int output, sample_t bias,
  44                   sample_t clev, sample_t slev);
  45 -void a52_upmix (sample_t * samples, int acmod, int output);
  46 +extern void (*a52_upmix) (sample_t * samples, int acmod, int output);
  47
  48  void a52_imdct_init (uint32_t mm_accel);
  49  void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias);
  50 -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias);
  51 +extern void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias);
  52 +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias);
  53 --- liba52/bitstream.c  2006-06-12 15:05:07.000000000 +0200
  54 +++ liba52/bitstream.c  2006-06-05 02:23:02.000000000 +0200
  55 @@ -31,6 +35,10 @@
  56
  57  #define BUFFER_SIZE 4096
  58
  59 +#ifdef ALT_BITSTREAM_READER
  60 +int indx=0;
  61 +#endif
  62 +
  63  void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf)
  64  {
  65      int align;
  66 @@ -38,6 +46,9 @@
  67      align = (long)buf & 3;
  68      state->buffer_start = (uint32_t *) (buf - align);
  69      state->bits_left = 0;
  70 +#ifdef ALT_BITSTREAM_READER
  71 +    indx=0;
  72 +#endif
  73      bitstream_get (state, align * 8);
  74  }
  75
  76 --- liba52/bitstream.h  2006-06-12 15:05:07.000000000 +0200
  77 +++ liba52/bitstream.h  2006-06-05 02:23:02.000000000 +0200
  78 @@ -21,6 +25,48 @@
  79   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  80   */
  81
  82 +/* code from ffmpeg/libavcodec */
  83 +#if defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 0)
  84 +#    define always_inline __attribute__((always_inline)) inline
  85 +#else
  86 +#    define always_inline inline
  87 +#endif
  88 +
  89 +#if defined(__sparc__) || defined(hpux)
  90 +/*
  91 + * the alt bitstream reader performs unaligned memory accesses; that doesn't work
  92 + * on sparc/hpux.  For now, disable ALT_BITSTREAM_READER.
  93 + */
  94 +#undef ALT_BITSTREAM_READER
  95 +#else
  96 +// alternative (faster) bitstream reader (reads up to 3 bytes past the end of the input)
  97 +#define ALT_BITSTREAM_READER
  98 +
  99 +/* used to avoid misaligned exceptions on some archs (alpha, ...) */
 100 +#if defined (ARCH_X86) || defined(ARCH_ARMV4L)
 101 +#    define unaligned32(a) (*(uint32_t*)(a))
 102 +#else
 103 +#    ifdef __GNUC__
 104 +static always_inline uint32_t unaligned32(const void *v) {
 105 +    struct Unaligned {
 106 +       uint32_t i;
 107 +    } __attribute__((packed));
 108 +
 109 +    return ((const struct Unaligned *) v)->i;
 110 +}
 111 +#    elif defined(__DECC)
 112 +static inline uint32_t unaligned32(const void *v) {
 113 +    return *(const __unaligned uint32_t *) v;
 114 +}
 115 +#    else
 116 +static inline uint32_t unaligned32(const void *v) {
 117 +    return *(const uint32_t *) v;
 118 +}
 119 +#    endif
 120 +#endif //!ARCH_X86
 121 +
 122 +#endif
 123 +
 124  /* (stolen from the kernel) */
 125  #ifdef WORDS_BIGENDIAN
 126
 127 @@ -28,7 +74,7 @@
 128
 129  #else
 130
 131 -#      if 0 && defined (__i386__)
 132 +#      if defined (__i386__)
 133
 134  #      define swab32(x) __i386_swab32(x)
 135         static inline const uint32_t __i386_swab32(uint32_t x)
 136 @@ -39,19 +85,34 @@
 137
 138  #      else
 139
 140 -#      define swab32(x)\
 141 -((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) |  \
 142 - (((uint8_t*)&x)[2] << 8)  | (((uint8_t*)&x)[3]))
 143 -
 144 +#      define swab32(x) __generic_swab32(x)
 145 +       static always_inline const uint32_t __generic_swab32(uint32_t x)
 146 +       {
 147 +               return ((((uint8_t*)&x)[0] << 24) | (((uint8_t*)&x)[1] << 16) |
 148 +                (((uint8_t*)&x)[2] << 8)  | (((uint8_t*)&x)[3]));
 149 +       }
 150  #      endif
 151  #endif
 152
 153 +#ifdef ALT_BITSTREAM_READER
 154 +extern int indx;
 155 +#endif
 156 +
 157  void a52_bitstream_set_ptr (a52_state_t * state, uint8_t * buf);
 158  uint32_t a52_bitstream_get_bh (a52_state_t * state, uint32_t num_bits);
 159  int32_t a52_bitstream_get_bh_2 (a52_state_t * state, uint32_t num_bits);
 160
 161  static inline uint32_t bitstream_get (a52_state_t * state, uint32_t num_bits)
 162  {
 163 +#ifdef ALT_BITSTREAM_READER
 164 +    uint32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
 165 +
 166 +    result<<= (indx&0x07);
 167 +    result>>= 32 - num_bits;
 168 +    indx+= num_bits;
 169 +
 170 +    return result;
 171 +#else
 172      uint32_t result;
 173
 174      if (num_bits < state->bits_left) {
 175 @@ -61,10 +122,29 @@
 176      }
 177
 178      return a52_bitstream_get_bh (state, num_bits);
 179 +#endif
 180 +}
 181 +
 182 +static inline void bitstream_skip(a52_state_t * state, int num_bits)
 183 +{
 184 +#ifdef ALT_BITSTREAM_READER
 185 +       indx+= num_bits;
 186 +#else
 187 +       bitstream_get(state, num_bits);
 188 +#endif
 189  }
 190
 191  static inline int32_t bitstream_get_2 (a52_state_t * state, uint32_t num_bits)
 192  {
 193 +#ifdef ALT_BITSTREAM_READER
 194 +    int32_t result= swab32( unaligned32(((uint8_t *)state->buffer_start)+(indx>>3)) );
 195 +
 196 +    result<<= (indx&0x07);
 197 +    result>>= 32 - num_bits;
 198 +    indx+= num_bits;
 199 +
 200 +    return result;
 201 +#else
 202      int32_t result;
 203
 204      if (num_bits < state->bits_left) {
 205 @@ -74,4 +154,5 @@
 206      }
 207
 208      return a52_bitstream_get_bh_2 (state, num_bits);
 209 +#endif
 210  }
 211 --- liba52/downmix.c    2006-06-12 15:17:53.000000000 +0200
 212 +++ liba52/downmix.c    2006-06-05 02:23:02.000000000 +0200
 213 @@ -19,18 +23,46 @@
 214   * You should have received a copy of the GNU General Public License
 215   * along with this program; if not, write to the Free Software
 216   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 217 + *
 218 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
 219   */
 220
 221  #include "config.h"
 222
 223  #include <string.h>
 224  #include <inttypes.h>
 225
 226  #include "a52.h"
 227  #include "a52_internal.h"
 228 +#include "mm_accel.h"
 229
 230  #define CONVERT(acmod,output) (((output) << 3) + (acmod))
 231
 232 +
 233 +void (*a52_downmix)(sample_t * samples, int acmod, int output, sample_t bias,
 234 +             sample_t clev, sample_t slev)= NULL;
 235 +void (*a52_upmix)(sample_t * samples, int acmod, int output)= NULL;
 236 +
 237 +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
 238 +             sample_t clev, sample_t slev);
 239 +static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
 240 +             sample_t clev, sample_t slev);
 241 +static void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
 242 +             sample_t clev, sample_t slev);
 243 +static void upmix_MMX (sample_t * samples, int acmod, int output);
 244 +static void upmix_C (sample_t * samples, int acmod, int output);
 245 +
 246 +void downmix_accel_init(uint32_t mm_accel)
 247 +{
 248 +    a52_upmix= upmix_C;
 249 +    a52_downmix= downmix_C;
 250 +#if defined(ARCH_X86) || defined(ARCH_X86_64)
 251 +    if(mm_accel & MM_ACCEL_X86_MMX) a52_upmix= upmix_MMX;
 252 +    if(mm_accel & MM_ACCEL_X86_SSE) a52_downmix= downmix_SSE;
 253 +    if(mm_accel & MM_ACCEL_X86_3DNOW) a52_downmix= downmix_3dnow;
 254 +#endif
 255 +}
 256 +
 257  int a52_downmix_init (int input, int flags, sample_t * level,
 258                       sample_t clev, sample_t slev)
 259  {
 260 @@ -447,7 +479,7 @@
 261         samples[i] = 0;
 262  }
 263
 264 -void a52_downmix (sample_t * samples, int acmod, int output, sample_t bias,
 265 +void downmix_C (sample_t * samples, int acmod, int output, sample_t bias,
 266                   sample_t clev, sample_t slev)
 267  {
 268      switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
 269 @@ -559,7 +591,7 @@
 270         break;
 271
 272      case CONVERT (A52_3F2R, A52_2F1R):
 273 -       mix3to2 (samples, bias);
 274 +       mix3to2 (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
 275         move2to1 (samples + 768, samples + 512, bias);
 276         break;
 277
 278 @@ -583,12 +615,12 @@
 279         break;
 280
 281      case CONVERT (A52_3F1R, A52_3F2R):
 282 -       memcpy (samples + 1027, samples + 768, 256 * sizeof (sample_t));
 283 +       memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
 284         break;
 285      }
 286  }
 287
 288 -void a52_upmix (sample_t * samples, int acmod, int output)
 289 +void upmix_C (sample_t * samples, int acmod, int output)
 290  {
 291      switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
 292
 293 @@ -653,3 +685,1137 @@
 294         goto mix_31to21;
 295      }
 296  }
 297 +
 298 +#if defined(ARCH_X86) || defined(ARCH_X86_64)
 299 +static void mix2to1_SSE (sample_t * dest, sample_t * src, sample_t bias)
 300 +{
 301 +       asm volatile(
 302 +       "movlps %2, %%xmm7              \n\t"
 303 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 304 +       "mov $-1024, %%"REG_S"          \n\t"
 305 +       ASMALIGN(4)
 306 +       "1:                             \n\t"
 307 +       "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 308 +       "movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
 309 +       "addps (%1, %%"REG_S"), %%xmm0  \n\t"
 310 +       "addps 16(%1, %%"REG_S"), %%xmm1\n\t"
 311 +       "addps %%xmm7, %%xmm0           \n\t"
 312 +       "addps %%xmm7, %%xmm1           \n\t"
 313 +       "movaps %%xmm0, (%1, %%"REG_S") \n\t"
 314 +       "movaps %%xmm1, 16(%1, %%"REG_S")\n\t"
 315 +       "add $32, %%"REG_S"             \n\t"
 316 +       " jnz 1b                        \n\t"
 317 +       :: "r" (src+256), "r" (dest+256), "m" (bias)
 318 +       : "%"REG_S
 319 +       );
 320 +}
 321 +
 322 +static void mix3to1_SSE (sample_t * samples, sample_t bias)
 323 +{
 324 +       asm volatile(
 325 +       "movlps %1, %%xmm7              \n\t"
 326 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 327 +       "mov $-1024, %%"REG_S"          \n\t"
 328 +       ASMALIGN(4)
 329 +       "1:                             \n\t"
 330 +       "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 331 +       "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
 332 +       "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
 333 +       "addps %%xmm7, %%xmm1           \n\t"
 334 +       "addps %%xmm1, %%xmm0           \n\t"
 335 +       "movaps %%xmm0, (%0, %%"REG_S") \n\t"
 336 +       "add $16, %%"REG_S"             \n\t"
 337 +       " jnz 1b                        \n\t"
 338 +       :: "r" (samples+256), "m" (bias)
 339 +       : "%"REG_S
 340 +       );
 341 +}
 342 +
 343 +static void mix4to1_SSE (sample_t * samples, sample_t bias)
 344 +{
 345 +       asm volatile(
 346 +       "movlps %1, %%xmm7              \n\t"
 347 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 348 +       "mov $-1024, %%"REG_S"          \n\t"
 349 +       ASMALIGN(4)
 350 +       "1:                             \n\t"
 351 +       "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 352 +       "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
 353 +       "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
 354 +       "addps 3072(%0, %%"REG_S"), %%xmm1\n\t"
 355 +       "addps %%xmm7, %%xmm0           \n\t"
 356 +       "addps %%xmm1, %%xmm0           \n\t"
 357 +       "movaps %%xmm0, (%0, %%"REG_S") \n\t"
 358 +       "add $16, %%"REG_S"             \n\t"
 359 +       " jnz 1b                        \n\t"
 360 +       :: "r" (samples+256), "m" (bias)
 361 +       : "%"REG_S
 362 +       );
 363 +}
 364 +
 365 +static void mix5to1_SSE (sample_t * samples, sample_t bias)
 366 +{
 367 +       asm volatile(
 368 +       "movlps %1, %%xmm7              \n\t"
 369 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 370 +       "mov $-1024, %%"REG_S"          \n\t"
 371 +       ASMALIGN(4)
 372 +       "1:                             \n\t"
 373 +       "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 374 +       "movaps 1024(%0, %%"REG_S"), %%xmm1\n\t"
 375 +       "addps 2048(%0, %%"REG_S"), %%xmm0\n\t"
 376 +       "addps 3072(%0, %%"REG_S"), %%xmm1\n\t"
 377 +       "addps %%xmm7, %%xmm0           \n\t"
 378 +       "addps 4096(%0, %%"REG_S"), %%xmm1\n\t"
 379 +       "addps %%xmm1, %%xmm0           \n\t"
 380 +       "movaps %%xmm0, (%0, %%"REG_S") \n\t"
 381 +       "add $16, %%"REG_S"             \n\t"
 382 +       " jnz 1b                        \n\t"
 383 +       :: "r" (samples+256), "m" (bias)
 384 +       : "%"REG_S
 385 +       );
 386 +}
 387 +
 388 +static void mix3to2_SSE (sample_t * samples, sample_t bias)
 389 +{
 390 +       asm volatile(
 391 +       "movlps %1, %%xmm7              \n\t"
 392 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 393 +       "mov $-1024, %%"REG_S"          \n\t"
 394 +       ASMALIGN(4)
 395 +       "1:                             \n\t"
 396 +       "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 397 +       "addps %%xmm7, %%xmm0           \n\t" //common
 398 +       "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 399 +       "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
 400 +       "addps %%xmm0, %%xmm1           \n\t"
 401 +       "addps %%xmm0, %%xmm2           \n\t"
 402 +       "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 403 +       "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
 404 +       "add $16, %%"REG_S"             \n\t"
 405 +       " jnz 1b                        \n\t"
 406 +       :: "r" (samples+256), "m" (bias)
 407 +       : "%"REG_S
 408 +       );
 409 +}
 410 +
 411 +static void mix21to2_SSE (sample_t * left, sample_t * right, sample_t bias)
 412 +{
 413 +       asm volatile(
 414 +               "movlps %2, %%xmm7              \n\t"
 415 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 416 +               "mov $-1024, %%"REG_S"          \n\t"
 417 +               ASMALIGN(4)
 418 +               "1:                             \n\t"
 419 +               "movaps 1024(%1, %%"REG_S"), %%xmm0\n\t"
 420 +               "addps %%xmm7, %%xmm0           \n\t" //common
 421 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 422 +               "movaps (%1, %%"REG_S"), %%xmm2 \n\t"
 423 +               "addps %%xmm0, %%xmm1           \n\t"
 424 +               "addps %%xmm0, %%xmm2           \n\t"
 425 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 426 +               "movaps %%xmm2, (%1, %%"REG_S") \n\t"
 427 +               "add $16, %%"REG_S"             \n\t"
 428 +               " jnz 1b                        \n\t"
 429 +       :: "r" (left+256), "r" (right+256), "m" (bias)
 430 +       : "%"REG_S
 431 +       );
 432 +}
 433 +
 434 +static void mix21toS_SSE (sample_t * samples, sample_t bias)
 435 +{
 436 +       asm volatile(
 437 +               "movlps %1, %%xmm7              \n\t"
 438 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 439 +               "mov $-1024, %%"REG_S"          \n\t"
 440 +               ASMALIGN(4)
 441 +               "1:                             \n\t"
 442 +               "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"  // surround
 443 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 444 +               "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t"
 445 +               "addps %%xmm7, %%xmm1           \n\t"
 446 +               "addps %%xmm7, %%xmm2           \n\t"
 447 +               "subps %%xmm0, %%xmm1           \n\t"
 448 +               "addps %%xmm0, %%xmm2           \n\t"
 449 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 450 +               "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
 451 +               "add $16, %%"REG_S"             \n\t"
 452 +               " jnz 1b                        \n\t"
 453 +       :: "r" (samples+256), "m" (bias)
 454 +       : "%"REG_S
 455 +       );
 456 +}
 457 +
 458 +static void mix31to2_SSE (sample_t * samples, sample_t bias)
 459 +{
 460 +       asm volatile(
 461 +               "movlps %1, %%xmm7              \n\t"
 462 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 463 +               "mov $-1024, %%"REG_S"          \n\t"
 464 +               ASMALIGN(4)
 465 +               "1:                             \n\t"
 466 +               "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 467 +               "addps 3072(%0, %%"REG_S"), %%xmm0\n\t"
 468 +               "addps %%xmm7, %%xmm0           \n\t" // common
 469 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 470 +               "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
 471 +               "addps %%xmm0, %%xmm1           \n\t"
 472 +               "addps %%xmm0, %%xmm2           \n\t"
 473 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 474 +               "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
 475 +               "add $16, %%"REG_S"             \n\t"
 476 +               " jnz 1b                        \n\t"
 477 +       :: "r" (samples+256), "m" (bias)
 478 +       : "%"REG_S
 479 +       );
 480 +}
 481 +
 482 +static void mix31toS_SSE (sample_t * samples, sample_t bias)
 483 +{
 484 +       asm volatile(
 485 +               "movlps %1, %%xmm7              \n\t"
 486 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 487 +               "mov $-1024, %%"REG_S"          \n\t"
 488 +               ASMALIGN(4)
 489 +               "1:                             \n\t"
 490 +               "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 491 +               "movaps 3072(%0, %%"REG_S"), %%xmm3\n\t" // surround
 492 +               "addps %%xmm7, %%xmm0           \n\t" // common
 493 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 494 +               "movaps 2048(%0, %%"REG_S"), %%xmm2\n\t"
 495 +               "addps %%xmm0, %%xmm1           \n\t"
 496 +               "addps %%xmm0, %%xmm2           \n\t"
 497 +               "subps %%xmm3, %%xmm1           \n\t"
 498 +               "addps %%xmm3, %%xmm2           \n\t"
 499 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 500 +               "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
 501 +               "add $16, %%"REG_S"             \n\t"
 502 +               " jnz 1b                        \n\t"
 503 +       :: "r" (samples+256), "m" (bias)
 504 +       : "%"REG_S
 505 +       );
 506 +}
 507 +
 508 +static void mix22toS_SSE (sample_t * samples, sample_t bias)
 509 +{
 510 +       asm volatile(
 511 +               "movlps %1, %%xmm7              \n\t"
 512 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 513 +               "mov $-1024, %%"REG_S"          \n\t"
 514 +               ASMALIGN(4)
 515 +               "1:                             \n\t"
 516 +               "movaps 2048(%0, %%"REG_S"), %%xmm0\n\t"
 517 +               "addps 3072(%0, %%"REG_S"), %%xmm0\n\t" // surround
 518 +               "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 519 +               "movaps 1024(%0, %%"REG_S"), %%xmm2\n\t"
 520 +               "addps %%xmm7, %%xmm1           \n\t"
 521 +               "addps %%xmm7, %%xmm2           \n\t"
 522 +               "subps %%xmm0, %%xmm1           \n\t"
 523 +               "addps %%xmm0, %%xmm2           \n\t"
 524 +               "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 525 +               "movaps %%xmm2, 1024(%0, %%"REG_S")\n\t"
 526 +               "add $16, %%"REG_S"             \n\t"
 527 +               " jnz 1b                        \n\t"
 528 +       :: "r" (samples+256), "m" (bias)
 529 +       : "%"REG_S
 530 +       );
 531 +}
 532 +
 533 +static void mix32to2_SSE (sample_t * samples, sample_t bias)
 534 +{
 535 +       asm volatile(
 536 +       "movlps %1, %%xmm7              \n\t"
 537 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 538 +       "mov $-1024, %%"REG_S"          \n\t"
 539 +       ASMALIGN(4)
 540 +       "1:                             \n\t"
 541 +       "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 542 +       "addps %%xmm7, %%xmm0           \n\t" // common
 543 +       "movaps %%xmm0, %%xmm1          \n\t" // common
 544 +       "addps (%0, %%"REG_S"), %%xmm0  \n\t"
 545 +       "addps 2048(%0, %%"REG_S"), %%xmm1\n\t"
 546 +       "addps 3072(%0, %%"REG_S"), %%xmm0\n\t"
 547 +       "addps 4096(%0, %%"REG_S"), %%xmm1\n\t"
 548 +       "movaps %%xmm0, (%0, %%"REG_S") \n\t"
 549 +       "movaps %%xmm1, 1024(%0, %%"REG_S")\n\t"
 550 +       "add $16, %%"REG_S"             \n\t"
 551 +       " jnz 1b                        \n\t"
 552 +       :: "r" (samples+256), "m" (bias)
 553 +       : "%"REG_S
 554 +       );
 555 +}
 556 +
 557 +static void mix32toS_SSE (sample_t * samples, sample_t bias)
 558 +{
 559 +       asm volatile(
 560 +       "movlps %1, %%xmm7              \n\t"
 561 +       "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 562 +       "mov $-1024, %%"REG_S"          \n\t"
 563 +       ASMALIGN(4)
 564 +       "1:                             \n\t"
 565 +       "movaps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 566 +       "movaps 3072(%0, %%"REG_S"), %%xmm2\n\t"
 567 +       "addps %%xmm7, %%xmm0           \n\t" // common
 568 +       "addps 4096(%0, %%"REG_S"), %%xmm2\n\t" // surround
 569 +       "movaps (%0, %%"REG_S"), %%xmm1 \n\t"
 570 +       "movaps 2048(%0, %%"REG_S"), %%xmm3\n\t"
 571 +       "subps %%xmm2, %%xmm1           \n\t"
 572 +       "addps %%xmm2, %%xmm3           \n\t"
 573 +       "addps %%xmm0, %%xmm1           \n\t"
 574 +       "addps %%xmm0, %%xmm3           \n\t"
 575 +       "movaps %%xmm1, (%0, %%"REG_S") \n\t"
 576 +       "movaps %%xmm3, 1024(%0, %%"REG_S")\n\t"
 577 +       "add $16, %%"REG_S"             \n\t"
 578 +       " jnz 1b                        \n\t"
 579 +       :: "r" (samples+256), "m" (bias)
 580 +       : "%"REG_S
 581 +       );
 582 +}
 583 +
 584 +static void move2to1_SSE (sample_t * src, sample_t * dest, sample_t bias)
 585 +{
 586 +       asm volatile(
 587 +               "movlps %2, %%xmm7              \n\t"
 588 +               "shufps $0x00, %%xmm7, %%xmm7   \n\t"
 589 +               "mov $-1024, %%"REG_S"          \n\t"
 590 +               ASMALIGN(4)
 591 +               "1:                             \n\t"
 592 +               "movaps (%0, %%"REG_S"), %%xmm0 \n\t"
 593 +               "movaps 16(%0, %%"REG_S"), %%xmm1\n\t"
 594 +               "addps 1024(%0, %%"REG_S"), %%xmm0\n\t"
 595 +               "addps 1040(%0, %%"REG_S"), %%xmm1\n\t"
 596 +               "addps %%xmm7, %%xmm0           \n\t"
 597 +               "addps %%xmm7, %%xmm1           \n\t"
 598 +               "movaps %%xmm0, (%1, %%"REG_S") \n\t"
 599 +               "movaps %%xmm1, 16(%1, %%"REG_S")\n\t"
 600 +               "add $32, %%"REG_S"             \n\t"
 601 +               " jnz 1b                        \n\t"
 602 +       :: "r" (src+256), "r" (dest+256), "m" (bias)
 603 +       : "%"REG_S
 604 +       );
 605 +}
 606 +
 607 +static void zero_MMX(sample_t * samples)
 608 +{
 609 +       asm volatile(
 610 +               "mov $-1024, %%"REG_S"          \n\t"
 611 +               "pxor %%mm0, %%mm0              \n\t"
 612 +               ASMALIGN(4)
 613 +               "1:                             \n\t"
 614 +               "movq %%mm0, (%0, %%"REG_S")    \n\t"
 615 +               "movq %%mm0, 8(%0, %%"REG_S")   \n\t"
 616 +               "movq %%mm0, 16(%0, %%"REG_S")  \n\t"
 617 +               "movq %%mm0, 24(%0, %%"REG_S")  \n\t"
 618 +               "add $32, %%"REG_S"             \n\t"
 619 +               " jnz 1b                        \n\t"
 620 +               "emms"
 621 +       :: "r" (samples+256)
 622 +       : "%"REG_S
 623 +       );
 624 +}
 625 +
 626 +/*
 627 + Assumes dest and src are at least 8-byte aligned and that size is a
 628 + multiple of 64 (size /= 64 below silently drops any remainder).
 629 + Note: untested and unused.
 630 +*/
 631 +static void copy_MMX(void *dest,const void *src,unsigned size)
 632 +{
 633 +  unsigned i;
 634 +  size /= 64;
 635 +       for(i=0;i<size;i++)
 636 +       {
 637 +           __asm __volatile(
 638 +               "movq   %0,   %%mm0\n\t"
 639 +               "movq   8%0,  %%mm1\n\t"
 640 +               "movq   16%0, %%mm2\n\t"
 641 +               "movq   24%0, %%mm3\n\t"
 642 +               "movq   32%0, %%mm4\n\t"
 643 +               "movq   40%0, %%mm5\n\t"
 644 +               "movq   48%0, %%mm6\n\t"
 645 +               "movq   56%0, %%mm7\n\t"
 646 +               "movq   %%mm0, %1\n\t"
 647 +               "movq   %%mm1, 8%1\n\t"
 648 +               "movq   %%mm2, 16%1\n\t"
 649 +               "movq   %%mm3, 24%1\n\t"
 650 +               "movq   %%mm4, 32%1\n\t"
 651 +               "movq   %%mm5, 40%1\n\t"
 652 +               "movq   %%mm6, 48%1\n\t"
 653 +               "movq   %%mm7, 56%1\n\t"
 654 +               :
 655 +               :"m"(src),"m"(dest));
 656 +       }
 657 +}
 658 +
 659 +static void downmix_SSE (sample_t * samples, int acmod, int output, sample_t bias,
 660 +             sample_t clev, sample_t slev)
 661 +{
 662 +    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) {
 663 +
 664 +    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
 665 +       memcpy (samples, samples + 256, 256 * sizeof (sample_t));
 666 +       break;
 667 +
 668 +    case CONVERT (A52_CHANNEL, A52_MONO):
 669 +    case CONVERT (A52_STEREO, A52_MONO):
 670 +    mix_2to1_SSE:
 671 +       mix2to1_SSE (samples, samples + 256, bias);
 672 +       break;
 673 +
 674 +    case CONVERT (A52_2F1R, A52_MONO):
 675 +       if (slev == 0)
 676 +           goto mix_2to1_SSE;
 677 +    case CONVERT (A52_3F, A52_MONO):
 678 +    mix_3to1_SSE:
 679 +       mix3to1_SSE (samples, bias);
 680 +       break;
 681 +
 682 +    case CONVERT (A52_3F1R, A52_MONO):
 683 +       if (slev == 0)
 684 +           goto mix_3to1_SSE;
 685 +    case CONVERT (A52_2F2R, A52_MONO):
 686 +       if (slev == 0)
 687 +           goto mix_2to1_SSE;
 688 +       mix4to1_SSE (samples, bias);
 689 +       break;
 690 +
 691 +    case CONVERT (A52_3F2R, A52_MONO):
 692 +       if (slev == 0)
 693 +           goto mix_3to1_SSE;
 694 +       mix5to1_SSE (samples, bias);
 695 +       break;
 696 +
 697 +    case CONVERT (A52_MONO, A52_DOLBY):
 698 +       memcpy (samples + 256, samples, 256 * sizeof (sample_t));
 699 +       break;
 700 +
 701 +    case CONVERT (A52_3F, A52_STEREO):
 702 +    case CONVERT (A52_3F, A52_DOLBY):
 703 +    mix_3to2_SSE:
 704 +       mix3to2_SSE (samples, bias);
 705 +       break;
 706 +
 707 +    case CONVERT (A52_2F1R, A52_STEREO):
 708 +       if (slev == 0)
 709 +           break;
 710 +       mix21to2_SSE (samples, samples + 256, bias);
 711 +       break;
 712 +
 713 +    case CONVERT (A52_2F1R, A52_DOLBY):
 714 +       mix21toS_SSE (samples, bias);
 715 +       break;
 716 +
 717 +    case CONVERT (A52_3F1R, A52_STEREO):
 718 +       if (slev == 0)
 719 +           goto mix_3to2_SSE;
 720 +       mix31to2_SSE (samples, bias);
 721 +       break;
 722 +
 723 +    case CONVERT (A52_3F1R, A52_DOLBY):
 724 +       mix31toS_SSE (samples, bias);
 725 +       break;
 726 +
 727 +    case CONVERT (A52_2F2R, A52_STEREO):
 728 +       if (slev == 0)
 729 +           break;
 730 +       mix2to1_SSE (samples, samples + 512, bias);
 731 +       mix2to1_SSE (samples + 256, samples + 768, bias);
 732 +       break;
 733 +
 734 +    case CONVERT (A52_2F2R, A52_DOLBY):
 735 +       mix22toS_SSE (samples, bias);
 736 +       break;
 737 +
 738 +    case CONVERT (A52_3F2R, A52_STEREO):
 739 +       if (slev == 0)
 740 +           goto mix_3to2_SSE;
 741 +       mix32to2_SSE (samples, bias);
 742 +       break;
 743 +
 744 +    case CONVERT (A52_3F2R, A52_DOLBY):
 745 +       mix32toS_SSE (samples, bias);
 746 +       break;
 747 +
 748 +    case CONVERT (A52_3F1R, A52_3F):
 749 +       if (slev == 0)
 750 +           break;
 751 +       mix21to2_SSE (samples, samples + 512, bias);
 752 +       break;
 753 +
 754 +    case CONVERT (A52_3F2R, A52_3F):
 755 +       if (slev == 0)
 756 +           break;
 757 +       mix2to1_SSE (samples, samples + 768, bias);
 758 +       mix2to1_SSE (samples + 512, samples + 1024, bias);
 759 +       break;
 760 +
 761 +    case CONVERT (A52_3F1R, A52_2F1R):
 762 +       mix3to2_SSE (samples, bias);
 763 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
 764 +       break;
 765 +
 766 +    case CONVERT (A52_2F2R, A52_2F1R):
 767 +       mix2to1_SSE (samples + 512, samples + 768, bias);
 768 +       break;
 769 +
 770 +    case CONVERT (A52_3F2R, A52_2F1R):
 771 +       mix3to2 (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
 772 +       move2to1_SSE (samples + 768, samples + 512, bias);
 773 +       break;
 774 +
 775 +    case CONVERT (A52_3F2R, A52_3F1R):
 776 +       mix2to1_SSE (samples + 768, samples + 1024, bias);
 777 +       break;
 778 +
 779 +    case CONVERT (A52_2F1R, A52_2F2R):
 780 +       memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t));
 781 +       break;
 782 +
 783 +    case CONVERT (A52_3F1R, A52_2F2R):
 784 +       mix3to2_SSE (samples, bias);
 785 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
 786 +       break;
 787 +
 788 +    case CONVERT (A52_3F2R, A52_2F2R):
 789 +       mix3to2_SSE (samples, bias);
 790 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t));
 791 +       memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t));
 792 +       break;
 793 +
 794 +    case CONVERT (A52_3F1R, A52_3F2R):
 795 +       memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t));
 796 +       break;
 797 +    }
 798 +}
 799 +
 800 +static void upmix_MMX (sample_t * samples, int acmod, int output) // MMX upmix: per (acmod, output) pair, clears and/or moves channel slots; slot n starts at samples + 256*n
 801 +{ // NOTE(review): zero_MMX is defined outside this chunk; presumably it clears one 256-sample slot - confirm
 802 +    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { // no default: pairs not listed below leave samples untouched
 803 +
 804 +    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
 805 +       memcpy (samples + 256, samples, 256 * sizeof (sample_t)); // duplicate slot 0 into slot 1
 806 +       break;
 807 +
 808 +    case CONVERT (A52_3F2R, A52_MONO):
 809 +       zero_MMX (samples + 1024); // slot 4, then falls through
 810 +    case CONVERT (A52_3F1R, A52_MONO):
 811 +    case CONVERT (A52_2F2R, A52_MONO):
 812 +       zero_MMX (samples + 768); // slot 3, then falls through
 813 +    case CONVERT (A52_3F, A52_MONO):
 814 +    case CONVERT (A52_2F1R, A52_MONO):
 815 +       zero_MMX (samples + 512); // slot 2, then falls through
 816 +    case CONVERT (A52_CHANNEL, A52_MONO):
 817 +    case CONVERT (A52_STEREO, A52_MONO):
 818 +       zero_MMX (samples + 256); // slot 1
 819 +       break;
 820 +
 821 +    case CONVERT (A52_3F2R, A52_STEREO):
 822 +    case CONVERT (A52_3F2R, A52_DOLBY):
 823 +       zero_MMX (samples + 1024); // slot 4, then falls through
 824 +    case CONVERT (A52_3F1R, A52_STEREO):
 825 +    case CONVERT (A52_3F1R, A52_DOLBY):
 826 +       zero_MMX (samples + 768); // slot 3, then falls through
 827 +    case CONVERT (A52_3F, A52_STEREO):
 828 +    case CONVERT (A52_3F, A52_DOLBY):
 829 +    mix_3to2_MMX: // shared tail, also entered by goto from mix_31to21_MMX
 830 +       memcpy (samples + 512, samples + 256, 256 * sizeof (sample_t)); // move slot 1 to slot 2
 831 +       zero_MMX (samples + 256); // slot 1
 832 +       break;
 833 +
 834 +    case CONVERT (A52_2F2R, A52_STEREO):
 835 +    case CONVERT (A52_2F2R, A52_DOLBY):
 836 +       zero_MMX (samples + 768); // slot 3, then falls through
 837 +    case CONVERT (A52_2F1R, A52_STEREO):
 838 +    case CONVERT (A52_2F1R, A52_DOLBY):
 839 +       zero_MMX (samples + 512); // slot 2
 840 +       break;
 841 +
 842 +    case CONVERT (A52_3F2R, A52_3F):
 843 +       zero_MMX (samples + 1024); // slot 4, then falls through
 844 +    case CONVERT (A52_3F1R, A52_3F):
 845 +    case CONVERT (A52_2F2R, A52_2F1R):
 846 +       zero_MMX (samples + 768); // slot 3
 847 +       break;
 848 +
 849 +    case CONVERT (A52_3F2R, A52_3F1R):
 850 +       zero_MMX (samples + 1024); // slot 4
 851 +       break;
 852 +
 853 +    case CONVERT (A52_3F2R, A52_2F1R):
 854 +       zero_MMX (samples + 1024); // slot 4, then falls through
 855 +    case CONVERT (A52_3F1R, A52_2F1R):
 856 +    mix_31to21_MMX: // also entered by goto from the 3F2R -> 2F2R case below
 857 +       memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); // copy slot 2 to slot 3
 858 +       goto mix_3to2_MMX; // finish with the slot 1 -> slot 2 move above
 859 +
 860 +    case CONVERT (A52_3F2R, A52_2F2R):
 861 +       memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); // copy slot 3 to slot 4
 862 +       goto mix_31to21_MMX;
 863 +    }
 864 +}
 865 +
 866 +static void mix2to1_3dnow (sample_t * dest, sample_t * src, sample_t bias) // 3DNow!: dest[i] += src[i] + bias for i = 0..255 (assumes sample_t is a 4-byte float)
 867 +{
 868 +       asm volatile(
 869 +       "movd  %2, %%mm7        \n\t" // mm7 low dword = bias
 870 +       "punpckldq %2, %%mm7    \n\t" // mm7 = {bias, bias}
 871 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to the +256 pointers
 872 +       ASMALIGN(4)
 873 +       "1:                     \n\t"
 874 +       "movq  (%0, %%"REG_S"), %%mm0   \n\t"
 875 +       "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
 876 +       "movq  16(%0, %%"REG_S"), %%mm2 \n\t"
 877 +       "movq  24(%0, %%"REG_S"), %%mm3 \n\t"
 878 +       "pfadd (%1, %%"REG_S"), %%mm0   \n\t"
 879 +       "pfadd 8(%1, %%"REG_S"), %%mm1  \n\t"
 880 +       "pfadd 16(%1, %%"REG_S"), %%mm2 \n\t"
 881 +       "pfadd 24(%1, %%"REG_S"), %%mm3 \n\t"
 882 +       "pfadd %%mm7, %%mm0             \n\t"
 883 +       "pfadd %%mm7, %%mm1             \n\t"
 884 +       "pfadd %%mm7, %%mm2             \n\t"
 885 +       "pfadd %%mm7, %%mm3             \n\t"
 886 +       "movq  %%mm0, (%1, %%"REG_S")   \n\t"
 887 +       "movq  %%mm1, 8(%1, %%"REG_S")  \n\t"
 888 +       "movq  %%mm2, 16(%1, %%"REG_S") \n\t"
 889 +       "movq  %%mm3, 24(%1, %%"REG_S") \n\t"
 890 +       "add $32, %%"REG_S"             \n\t" // 8 samples per iteration
 891 +       " jnz 1b                        \n\t"
 892 +       :: "r" (src+256), "r" (dest+256), "m" (bias)
 893 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %1, which is only an input operand
 894 +       );
 895 +}
 896 +
 897 +static void mix3to1_3dnow (sample_t * samples, sample_t bias) // 3DNow!: samples[i] += samples[i+256] + samples[i+512] + bias for i = 0..255 (assumes 4-byte float sample_t)
 898 +{
 899 +       asm volatile(
 900 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
 901 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
 902 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
 903 +       ASMALIGN(4)
 904 +       "1:                     \n\t"
 905 +       "movq  (%0, %%"REG_S"), %%mm0   \n\t"
 906 +       "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
 907 +       "movq  1024(%0, %%"REG_S"), %%mm2\n\t"
 908 +       "movq  1032(%0, %%"REG_S"), %%mm3\n\t"
 909 +       "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
 910 +       "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
 911 +       "pfadd %%mm7, %%mm0             \n\t"
 912 +       "pfadd %%mm7, %%mm1             \n\t"
 913 +       "pfadd %%mm2, %%mm0             \n\t"
 914 +       "pfadd %%mm3, %%mm1             \n\t"
 915 +       "movq  %%mm0, (%0, %%"REG_S")   \n\t"
 916 +       "movq  %%mm1, 8(%0, %%"REG_S")  \n\t"
 917 +       "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
 918 +       " jnz 1b                        \n\t"
 919 +       :: "r" (samples+256), "m" (bias)
 920 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
 921 +       );
 922 +}
 923 +
 924 +static void mix4to1_3dnow (sample_t * samples, sample_t bias) // 3DNow!: samples[i] += samples[i+256] + samples[i+512] + samples[i+768] + bias for i = 0..255 (assumes 4-byte float sample_t)
 925 +{
 926 +       asm volatile(
 927 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
 928 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
 929 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
 930 +       ASMALIGN(4)
 931 +       "1:                     \n\t"
 932 +       "movq  (%0, %%"REG_S"), %%mm0   \n\t"
 933 +       "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
 934 +       "movq  1024(%0, %%"REG_S"), %%mm2\n\t"
 935 +       "movq  1032(%0, %%"REG_S"), %%mm3\n\t"
 936 +       "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
 937 +       "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
 938 +       "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t"
 939 +       "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t"
 940 +       "pfadd %%mm7, %%mm0             \n\t"
 941 +       "pfadd %%mm7, %%mm1             \n\t"
 942 +       "pfadd %%mm2, %%mm0             \n\t"
 943 +       "pfadd %%mm3, %%mm1             \n\t"
 944 +       "movq  %%mm0, (%0, %%"REG_S")   \n\t"
 945 +       "movq  %%mm1, 8(%0, %%"REG_S")  \n\t"
 946 +       "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
 947 +       " jnz 1b                        \n\t"
 948 +       :: "r" (samples+256), "m" (bias)
 949 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
 950 +       );
 951 +}
 952 +
 953 +static void mix5to1_3dnow (sample_t * samples, sample_t bias) // 3DNow!: samples[i] += samples[i+256] + samples[i+512] + samples[i+768] + samples[i+1024] + bias for i = 0..255 (assumes 4-byte float sample_t)
 954 +{
 955 +       asm volatile(
 956 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
 957 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
 958 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
 959 +       ASMALIGN(4)
 960 +       "1:                     \n\t"
 961 +       "movq  (%0, %%"REG_S"), %%mm0   \n\t"
 962 +       "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
 963 +       "movq  1024(%0, %%"REG_S"), %%mm2\n\t"
 964 +       "movq  1032(%0, %%"REG_S"), %%mm3\n\t"
 965 +       "pfadd 2048(%0, %%"REG_S"), %%mm0\n\t"
 966 +       "pfadd 2056(%0, %%"REG_S"), %%mm1\n\t"
 967 +       "pfadd 3072(%0, %%"REG_S"), %%mm2\n\t"
 968 +       "pfadd 3080(%0, %%"REG_S"), %%mm3\n\t"
 969 +       "pfadd %%mm7, %%mm0             \n\t"
 970 +       "pfadd %%mm7, %%mm1             \n\t"
 971 +       "pfadd 4096(%0, %%"REG_S"), %%mm2\n\t"
 972 +       "pfadd 4104(%0, %%"REG_S"), %%mm3\n\t"
 973 +       "pfadd %%mm2, %%mm0             \n\t"
 974 +       "pfadd %%mm3, %%mm1             \n\t"
 975 +       "movq  %%mm0, (%0, %%"REG_S")   \n\t"
 976 +       "movq  %%mm1, 8(%0, %%"REG_S")  \n\t"
 977 +       "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
 978 +       " jnz 1b                        \n\t"
 979 +       :: "r" (samples+256), "m" (bias)
 980 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
 981 +       );
 982 +}
 983 +
 984 +static void mix3to2_3dnow (sample_t * samples, sample_t bias) // 3DNow!: common = samples[i+256] + bias; samples[i] += common; samples[i+256] = samples[i+512] + common, i = 0..255 (assumes 4-byte float sample_t)
 985 +{
 986 +       asm volatile(
 987 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
 988 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
 989 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
 990 +       ASMALIGN(4)
 991 +       "1:                     \n\t"
 992 +       "movq   1024(%0, %%"REG_S"), %%mm0\n\t"
 993 +       "movq   1032(%0, %%"REG_S"), %%mm1\n\t"
 994 +       "pfadd  %%mm7, %%mm0            \n\t" //common
 995 +       "pfadd  %%mm7, %%mm1            \n\t" //common
 996 +       "movq   (%0, %%"REG_S"), %%mm2  \n\t"
 997 +       "movq   8(%0, %%"REG_S"), %%mm3 \n\t"
 998 +       "movq   2048(%0, %%"REG_S"), %%mm4\n\t"
 999 +       "movq   2056(%0, %%"REG_S"), %%mm5\n\t"
1000 +       "pfadd  %%mm0, %%mm2            \n\t"
1001 +       "pfadd  %%mm1, %%mm3            \n\t"
1002 +       "pfadd  %%mm0, %%mm4            \n\t"
1003 +       "pfadd  %%mm1, %%mm5            \n\t"
1004 +       "movq   %%mm2, (%0, %%"REG_S")  \n\t"
1005 +       "movq   %%mm3, 8(%0, %%"REG_S") \n\t"
1006 +       "movq   %%mm4, 1024(%0, %%"REG_S")\n\t"
1007 +       "movq   %%mm5, 1032(%0, %%"REG_S")\n\t"
1008 +       "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
1009 +       " jnz 1b                        \n\t"
1010 +       :: "r" (samples+256), "m" (bias)
1011 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
1012 +       );
1013 +}
1014 +
1015 +static void mix21to2_3dnow (sample_t * left, sample_t * right, sample_t bias) // 3DNow!: common = right[i+256] + bias; left[i] += common; right[i] += common, i = 0..255 (assumes 4-byte float sample_t)
1016 +{
1017 +       asm volatile(
1018 +               "movd  %2, %%mm7        \n\t" // mm7 low dword = bias
1019 +               "punpckldq %2, %%mm7    \n\t" // mm7 = {bias, bias}
1020 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to the +256 pointers
1021 +               ASMALIGN(4)
1022 +               "1:                     \n\t"
1023 +               "movq  1024(%1, %%"REG_S"), %%mm0\n\t"
1024 +               "movq  1032(%1, %%"REG_S"), %%mm1\n\t"
1025 +               "pfadd %%mm7, %%mm0             \n\t" //common
1026 +               "pfadd %%mm7, %%mm1             \n\t" //common
1027 +               "movq  (%0, %%"REG_S"), %%mm2   \n\t"
1028 +               "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1029 +               "movq  (%1, %%"REG_S"), %%mm4   \n\t"
1030 +               "movq  8(%1, %%"REG_S"), %%mm5  \n\t"
1031 +               "pfadd %%mm0, %%mm2             \n\t"
1032 +               "pfadd %%mm1, %%mm3             \n\t"
1033 +               "pfadd %%mm0, %%mm4             \n\t"
1034 +               "pfadd %%mm1, %%mm5             \n\t"
1035 +               "movq  %%mm2, (%0, %%"REG_S")   \n\t"
1036 +               "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1037 +               "movq  %%mm4, (%1, %%"REG_S")   \n\t"
1038 +               "movq  %%mm5, 8(%1, %%"REG_S")  \n\t"
1039 +               "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
1040 +               " jnz 1b                        \n\t"
1041 +       :: "r" (left+256), "r" (right+256), "m" (bias)
1042 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0 and %1, which are only input operands
1043 +       );
1044 +}
1045 +
1046 +static void mix21toS_3dnow (sample_t * samples, sample_t bias) // 3DNow!: surround = samples[i+512]; samples[i] += bias - surround; samples[i+256] += bias + surround, i = 0..255 (assumes 4-byte float sample_t)
1047 +{
1048 +       asm volatile(
1049 +               "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1050 +               "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1051 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
1052 +               ASMALIGN(4)
1053 +               "1:                     \n\t"
1054 +               "movq  2048(%0, %%"REG_S"), %%mm0\n\t"  // surround
1055 +               "movq  2056(%0, %%"REG_S"), %%mm1\n\t"  // surround
1056 +               "movq  (%0, %%"REG_S"), %%mm2   \n\t"
1057 +               "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1058 +               "movq  1024(%0, %%"REG_S"), %%mm4\n\t"
1059 +               "movq  1032(%0, %%"REG_S"), %%mm5\n\t"
1060 +               "pfadd %%mm7, %%mm2             \n\t"
1061 +               "pfadd %%mm7, %%mm3             \n\t"
1062 +               "pfadd %%mm7, %%mm4             \n\t"
1063 +               "pfadd %%mm7, %%mm5             \n\t"
1064 +               "pfsub %%mm0, %%mm2             \n\t"
1065 +               "pfsub %%mm1, %%mm3             \n\t"
1066 +               "pfadd %%mm0, %%mm4             \n\t"
1067 +               "pfadd %%mm1, %%mm5             \n\t"
1068 +               "movq  %%mm2, (%0, %%"REG_S")   \n\t"
1069 +               "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1070 +               "movq  %%mm4, 1024(%0, %%"REG_S")\n\t"
1071 +               "movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
1072 +               "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
1073 +               " jnz 1b                        \n\t"
1074 +       :: "r" (samples+256), "m" (bias)
1075 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
1076 +       );
1077 +}
1078 +
1079 +static void mix31to2_3dnow (sample_t * samples, sample_t bias) // 3DNow!: common = samples[i+256] + samples[i+768] + bias; samples[i] += common; samples[i+256] = samples[i+512] + common, i = 0..255 (assumes 4-byte float sample_t)
1080 +{
1081 +       asm volatile(
1082 +               "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1083 +               "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1084 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
1085 +               ASMALIGN(4)
1086 +               "1:                     \n\t"
1087 +               "movq  1024(%0, %%"REG_S"), %%mm0\n\t"
1088 +               "movq  1032(%0, %%"REG_S"), %%mm1\n\t"
1089 +               "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t"
1090 +               "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t"
1091 +               "pfadd %%mm7, %%mm0             \n\t" // common
1092 +               "pfadd %%mm7, %%mm1             \n\t" // common
1093 +               "movq  (%0, %%"REG_S"), %%mm2   \n\t"
1094 +               "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1095 +               "movq  2048(%0, %%"REG_S"), %%mm4\n\t"
1096 +               "movq  2056(%0, %%"REG_S"), %%mm5\n\t"
1097 +               "pfadd %%mm0, %%mm2             \n\t"
1098 +               "pfadd %%mm1, %%mm3             \n\t"
1099 +               "pfadd %%mm0, %%mm4             \n\t"
1100 +               "pfadd %%mm1, %%mm5             \n\t"
1101 +               "movq  %%mm2, (%0, %%"REG_S")   \n\t"
1102 +               "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1103 +               "movq  %%mm4, 1024(%0, %%"REG_S")\n\t"
1104 +               "movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
1105 +               "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
1106 +               " jnz 1b                        \n\t"
1107 +       :: "r" (samples+256), "m" (bias)
1108 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
1109 +       );
1110 +}
1111 +
1112 +static void mix31toS_3dnow (sample_t * samples, sample_t bias) // 3DNow!: common = samples[i+256] + bias; surround = samples[i+768]; samples[i] += common - surround; samples[i+256] = samples[i+512] + common + surround, i = 0..255 (assumes 4-byte float sample_t)
1113 +{
1114 +       asm volatile(
1115 +               "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1116 +               "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1117 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
1118 +               ASMALIGN(4)
1119 +               "1:                     \n\t"
1120 +               "movq   1024(%0, %%"REG_S"), %%mm0\n\t"
1121 +               "movq   1032(%0, %%"REG_S"), %%mm1\n\t"
1122 +               "pfadd  %%mm7, %%mm0            \n\t" // common
1123 +               "pfadd  %%mm7, %%mm1            \n\t" // common
1124 +               "movq   (%0, %%"REG_S"), %%mm2  \n\t"
1125 +               "movq   8(%0, %%"REG_S"), %%mm3 \n\t"
1126 +               "movq   2048(%0, %%"REG_S"), %%mm4\n\t"
1127 +               "movq   2056(%0, %%"REG_S"), %%mm5\n\t"
1128 +               "pfadd  %%mm0, %%mm2            \n\t"
1129 +               "pfadd  %%mm1, %%mm3            \n\t"
1130 +               "pfadd  %%mm0, %%mm4            \n\t"
1131 +               "pfadd  %%mm1, %%mm5            \n\t"
1132 +               "movq   3072(%0, %%"REG_S"), %%mm0\n\t" // surround
1133 +               "movq   3080(%0, %%"REG_S"), %%mm1\n\t" // surround
1134 +               "pfsub  %%mm0, %%mm2            \n\t"
1135 +               "pfsub  %%mm1, %%mm3            \n\t"
1136 +               "pfadd  %%mm0, %%mm4            \n\t"
1137 +               "pfadd  %%mm1, %%mm5            \n\t"
1138 +               "movq   %%mm2, (%0, %%"REG_S")  \n\t"
1139 +               "movq   %%mm3, 8(%0, %%"REG_S") \n\t"
1140 +               "movq   %%mm4, 1024(%0, %%"REG_S")\n\t"
1141 +               "movq   %%mm5, 1032(%0, %%"REG_S")\n\t"
1142 +               "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
1143 +               " jnz 1b                        \n\t"
1144 +       :: "r" (samples+256), "m" (bias)
1145 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
1146 +       );
1147 +}
1148 +
1149 +static void mix22toS_3dnow (sample_t * samples, sample_t bias) // 3DNow!: surround = samples[i+512] + samples[i+768]; samples[i] += bias - surround; samples[i+256] += bias + surround, i = 0..255 (assumes 4-byte float sample_t)
1150 +{
1151 +       asm volatile(
1152 +               "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1153 +               "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1154 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
1155 +               ASMALIGN(4)
1156 +               "1:                     \n\t"
1157 +               "movq  2048(%0, %%"REG_S"), %%mm0\n\t"
1158 +               "movq  2056(%0, %%"REG_S"), %%mm1\n\t"
1159 +               "pfadd 3072(%0, %%"REG_S"), %%mm0\n\t" // surround
1160 +               "pfadd 3080(%0, %%"REG_S"), %%mm1\n\t" // surround
1161 +               "movq  (%0, %%"REG_S"), %%mm2   \n\t"
1162 +               "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1163 +               "movq  1024(%0, %%"REG_S"), %%mm4\n\t"
1164 +               "movq  1032(%0, %%"REG_S"), %%mm5\n\t"
1165 +               "pfadd %%mm7, %%mm2             \n\t"
1166 +               "pfadd %%mm7, %%mm3             \n\t"
1167 +               "pfadd %%mm7, %%mm4             \n\t"
1168 +               "pfadd %%mm7, %%mm5             \n\t"
1169 +               "pfsub %%mm0, %%mm2             \n\t"
1170 +               "pfsub %%mm1, %%mm3             \n\t"
1171 +               "pfadd %%mm0, %%mm4             \n\t"
1172 +               "pfadd %%mm1, %%mm5             \n\t"
1173 +               "movq  %%mm2, (%0, %%"REG_S")   \n\t"
1174 +               "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1175 +               "movq  %%mm4, 1024(%0, %%"REG_S")\n\t"
1176 +               "movq  %%mm5, 1032(%0, %%"REG_S")\n\t"
1177 +               "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
1178 +               " jnz 1b                        \n\t"
1179 +       :: "r" (samples+256), "m" (bias)
1180 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
1181 +       );
1182 +}
1183 +
1184 +static void mix32to2_3dnow (sample_t * samples, sample_t bias) // 3DNow!: common = samples[i+256] + bias; samples[i] += common + samples[i+768]; samples[i+256] = common + samples[i+512] + samples[i+1024], i = 0..255 (assumes 4-byte float sample_t)
1185 +{
1186 +       asm volatile(
1187 +       "movd  %1, %%mm7        \n\t" // mm7 low dword = bias
1188 +       "punpckldq %1, %%mm7    \n\t" // mm7 = {bias, bias}
1189 +       "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
1190 +       ASMALIGN(4)
1191 +       "1:                     \n\t"
1192 +       "movq   1024(%0, %%"REG_S"), %%mm0\n\t"
1193 +       "movq   1032(%0, %%"REG_S"), %%mm1\n\t"
1194 +       "pfadd  %%mm7, %%mm0            \n\t" // common
1195 +       "pfadd  %%mm7, %%mm1            \n\t" // common
1196 +       "movq   %%mm0, %%mm2            \n\t" // common
1197 +       "movq   %%mm1, %%mm3            \n\t" // common
1198 +       "pfadd  (%0, %%"REG_S"), %%mm0  \n\t"
1199 +       "pfadd  8(%0, %%"REG_S"), %%mm1 \n\t"
1200 +       "pfadd  2048(%0, %%"REG_S"), %%mm2\n\t"
1201 +       "pfadd  2056(%0, %%"REG_S"), %%mm3\n\t"
1202 +       "pfadd  3072(%0, %%"REG_S"), %%mm0\n\t"
1203 +       "pfadd  3080(%0, %%"REG_S"), %%mm1\n\t"
1204 +       "pfadd  4096(%0, %%"REG_S"), %%mm2\n\t"
1205 +       "pfadd  4104(%0, %%"REG_S"), %%mm3\n\t"
1206 +       "movq   %%mm0, (%0, %%"REG_S")  \n\t"
1207 +       "movq   %%mm1, 8(%0, %%"REG_S") \n\t"
1208 +       "movq   %%mm2, 1024(%0, %%"REG_S")\n\t"
1209 +       "movq   %%mm3, 1032(%0, %%"REG_S")\n\t"
1210 +       "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
1211 +       " jnz 1b                        \n\t"
1212 +       :: "r" (samples+256), "m" (bias)
1213 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
1214 +       );
1215 +}
1216 +
1217 +/* todo: should be optimized better */
1218 +static void mix32toS_3dnow (sample_t * samples, sample_t bias) // 3DNow!: common = samples[i+256] + bias; surround = samples[i+768] + samples[i+1024]; samples[i] += common - surround; samples[i+256] = samples[i+512] + common + surround, i = 0..255 (assumes 4-byte float sample_t)
1219 +{
1220 +       asm volatile(
1221 +       "mov $-1024, %%"REG_S"          \n\t" // byte offset counts up from -1024 to 0 relative to samples+256
1222 +       ASMALIGN(4)
1223 +       "1:                     \n\t"
1224 +       "movd  %1, %%mm7                \n\t" // bias is reloaded every iteration because mm7 is reused as a data register below
1225 +       "punpckldq %1, %%mm7            \n\t" // mm7 = {bias, bias}
1226 +       "movq  1024(%0, %%"REG_S"), %%mm0\n\t"
1227 +       "movq  1032(%0, %%"REG_S"), %%mm1\n\t"
1228 +       "movq  3072(%0, %%"REG_S"), %%mm4\n\t"
1229 +       "movq  3080(%0, %%"REG_S"), %%mm5\n\t"
1230 +       "pfadd %%mm7, %%mm0             \n\t" // common
1231 +       "pfadd %%mm7, %%mm1             \n\t" // common
1232 +       "pfadd 4096(%0, %%"REG_S"), %%mm4\n\t" // surround
1233 +       "pfadd 4104(%0, %%"REG_S"), %%mm5\n\t" // surround
1234 +       "movq  (%0, %%"REG_S"), %%mm2   \n\t"
1235 +       "movq  8(%0, %%"REG_S"), %%mm3  \n\t"
1236 +       "movq  2048(%0, %%"REG_S"), %%mm6\n\t"
1237 +       "movq  2056(%0, %%"REG_S"), %%mm7\n\t" // overwrites the bias copy in mm7
1238 +       "pfsub %%mm4, %%mm2             \n\t"
1239 +       "pfsub %%mm5, %%mm3             \n\t"
1240 +       "pfadd %%mm4, %%mm6             \n\t"
1241 +       "pfadd %%mm5, %%mm7             \n\t"
1242 +       "pfadd %%mm0, %%mm2             \n\t"
1243 +       "pfadd %%mm1, %%mm3             \n\t"
1244 +       "pfadd %%mm0, %%mm6             \n\t"
1245 +       "pfadd %%mm1, %%mm7             \n\t"
1246 +       "movq  %%mm2, (%0, %%"REG_S")   \n\t"
1247 +       "movq  %%mm3, 8(%0, %%"REG_S")  \n\t"
1248 +       "movq  %%mm6, 1024(%0, %%"REG_S")\n\t"
1249 +       "movq  %%mm7, 1032(%0, %%"REG_S")\n\t"
1250 +       "add $16, %%"REG_S"             \n\t" // 4 samples per iteration
1251 +       " jnz 1b                        \n\t"
1252 +       :: "r" (samples+256), "m" (bias)
1253 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %0, which is only an input operand
1254 +       );
1255 +}
1256 +
1257 +static void move2to1_3dnow (sample_t * src, sample_t * dest, sample_t bias) // 3DNow!: dest[i] = src[i] + src[i+256] + bias for i = 0..255 (assumes 4-byte float sample_t)
1258 +{
1259 +       asm volatile(
1260 +               "movd  %2, %%mm7        \n\t" // mm7 low dword = bias
1261 +               "punpckldq %2, %%mm7    \n\t" // mm7 = {bias, bias}
1262 +               "mov $-1024, %%"REG_S"  \n\t" // byte offset counts up from -1024 to 0 relative to the +256 pointers
1263 +               ASMALIGN(4)
1264 +               "1:                     \n\t"
1265 +               "movq  (%0, %%"REG_S"), %%mm0   \n\t"
1266 +               "movq  8(%0, %%"REG_S"), %%mm1  \n\t"
1267 +               "movq  16(%0, %%"REG_S"), %%mm2 \n\t"
1268 +               "movq  24(%0, %%"REG_S"), %%mm3 \n\t"
1269 +               "pfadd 1024(%0, %%"REG_S"), %%mm0\n\t"
1270 +               "pfadd 1032(%0, %%"REG_S"), %%mm1\n\t"
1271 +               "pfadd 1040(%0, %%"REG_S"), %%mm2\n\t"
1272 +               "pfadd 1048(%0, %%"REG_S"), %%mm3\n\t"
1273 +               "pfadd %%mm7, %%mm0             \n\t"
1274 +               "pfadd %%mm7, %%mm1             \n\t"
1275 +               "pfadd %%mm7, %%mm2             \n\t"
1276 +               "pfadd %%mm7, %%mm3             \n\t"
1277 +               "movq  %%mm0, (%1, %%"REG_S")   \n\t"
1278 +               "movq  %%mm1, 8(%1, %%"REG_S")  \n\t"
1279 +               "movq  %%mm2, 16(%1, %%"REG_S") \n\t"
1280 +               "movq  %%mm3, 24(%1, %%"REG_S") \n\t"
1281 +               "add $32, %%"REG_S"             \n\t" // 8 samples per iteration
1282 +               " jnz 1b                        \n\t"
1283 +       :: "r" (src+256), "r" (dest+256), "m" (bias)
1284 +       : "%"REG_S, "memory" // memory clobber: the loop stores through %1, which is only an input operand
1285 +       );
1286 +}
1287 +
1288 +static void downmix_3dnow (sample_t * samples, int acmod, int output, sample_t bias,
1289 +             sample_t clev, sample_t slev) // 3DNow! downmix dispatcher: picks a mix helper per (acmod, output) pair; clev is not referenced here, slev is only compared against 0
1290 +{
1291 +    switch (CONVERT (acmod, output & A52_CHANNEL_MASK)) { // no default: pairs not listed below leave samples untouched
1292 +
1293 +    case CONVERT (A52_CHANNEL, A52_CHANNEL2):
1294 +       memcpy (samples, samples + 256, 256 * sizeof (sample_t)); // overwrite the first 256 samples with the second 256
1295 +       break;
1296 +
1297 +    case CONVERT (A52_CHANNEL, A52_MONO):
1298 +    case CONVERT (A52_STEREO, A52_MONO):
1299 +    mix_2to1_3dnow: // shared target, also reached by goto from the slev == 0 shortcuts below
1300 +       mix2to1_3dnow (samples, samples + 256, bias);
1301 +       break;
1302 +
1303 +    case CONVERT (A52_2F1R, A52_MONO):
1304 +       if (slev == 0)
1305 +           goto mix_2to1_3dnow;
1306 +    case CONVERT (A52_3F, A52_MONO): // reached by fall-through from the case above when slev != 0
1307 +    mix_3to1_3dnow:
1308 +       mix3to1_3dnow (samples, bias);
1309 +       break;
1310 +
1311 +    case CONVERT (A52_3F1R, A52_MONO):
1312 +       if (slev == 0)
1313 +           goto mix_3to1_3dnow;
1314 +    case CONVERT (A52_2F2R, A52_MONO): // reached by fall-through from the case above when slev != 0
1315 +       if (slev == 0)
1316 +           goto mix_2to1_3dnow;
1317 +       mix4to1_3dnow (samples, bias);
1318 +       break;
1319 +
1320 +    case CONVERT (A52_3F2R, A52_MONO):
1321 +       if (slev == 0)
1322 +           goto mix_3to1_3dnow;
1323 +       mix5to1_3dnow (samples, bias);
1324 +       break;
1325 +
1326 +    case CONVERT (A52_MONO, A52_DOLBY):
1327 +       memcpy (samples + 256, samples, 256 * sizeof (sample_t)); // duplicate the first 256 samples into the second 256
1328 +       break;
1329 +
1330 +    case CONVERT (A52_3F, A52_STEREO):
1331 +    case CONVERT (A52_3F, A52_DOLBY):
1332 +    mix_3to2_3dnow: // shared target, also reached by goto from the slev == 0 shortcuts below
1333 +       mix3to2_3dnow (samples, bias);
1334 +       break;
1335 +
1336 +    case CONVERT (A52_2F1R, A52_STEREO):
1337 +       if (slev == 0)
1338 +           break; // nothing is mixed in this case when slev is 0
1339 +       mix21to2_3dnow (samples, samples + 256, bias);
1340 +       break;
1341 +
1342 +    case CONVERT (A52_2F1R, A52_DOLBY):
1343 +       mix21toS_3dnow (samples, bias);
1344 +       break;
1345 +
1346 +    case CONVERT (A52_3F1R, A52_STEREO):
1347 +       if (slev == 0)
1348 +           goto mix_3to2_3dnow;
1349 +       mix31to2_3dnow (samples, bias);
1350 +       break;
1351 +
1352 +    case CONVERT (A52_3F1R, A52_DOLBY):
1353 +       mix31toS_3dnow (samples, bias);
1354 +       break;
1355 +
1356 +    case CONVERT (A52_2F2R, A52_STEREO):
1357 +       if (slev == 0)
1358 +           break; // nothing is mixed in this case when slev is 0
1359 +       mix2to1_3dnow (samples, samples + 512, bias);
1360 +       mix2to1_3dnow (samples + 256, samples + 768, bias);
1361 +       break;
1362 +
1363 +    case CONVERT (A52_2F2R, A52_DOLBY):
1364 +       mix22toS_3dnow (samples, bias);
1365 +       break;
1366 +
1367 +    case CONVERT (A52_3F2R, A52_STEREO):
1368 +       if (slev == 0)
1369 +           goto mix_3to2_3dnow;
1370 +       mix32to2_3dnow (samples, bias);
1371 +       break;
1372 +
1373 +    case CONVERT (A52_3F2R, A52_DOLBY):
1374 +       mix32toS_3dnow (samples, bias);
1375 +       break;
1376 +
1377 +    case CONVERT (A52_3F1R, A52_3F):
1378 +       if (slev == 0)
1379 +           break; // nothing is mixed in this case when slev is 0
1380 +       mix21to2_3dnow (samples, samples + 512, bias);
1381 +       break;
1382 +
1383 +    case CONVERT (A52_3F2R, A52_3F):
1384 +       if (slev == 0)
1385 +           break; // nothing is mixed in this case when slev is 0
1386 +       mix2to1_3dnow (samples, samples + 768, bias);
1387 +       mix2to1_3dnow (samples + 512, samples + 1024, bias);
1388 +       break;
1389 +
1390 +    case CONVERT (A52_3F1R, A52_2F1R):
1391 +       mix3to2_3dnow (samples, bias);
1392 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); // move samples[768..1023] down to samples[512..767]
1393 +       break;
1394 +
1395 +    case CONVERT (A52_2F2R, A52_2F1R):
1396 +       mix2to1_3dnow (samples + 512, samples + 768, bias);
1397 +       break;
1398 +
1399 +    case CONVERT (A52_3F2R, A52_2F1R):
1400 +       mix3to2_3dnow (samples, bias); //FIXME possible bug? (output doesn't seem to be used)
1401 +       move2to1_3dnow (samples + 768, samples + 512, bias);
1402 +       break;
1403 +
1404 +    case CONVERT (A52_3F2R, A52_3F1R):
1405 +       mix2to1_3dnow (samples + 768, samples + 1024, bias);
1406 +       break;
1407 +
1408 +    case CONVERT (A52_2F1R, A52_2F2R):
1409 +       memcpy (samples + 768, samples + 512, 256 * sizeof (sample_t)); // duplicate samples[512..767] into samples[768..1023]
1410 +       break;
1411 +
1412 +    case CONVERT (A52_3F1R, A52_2F2R):
1413 +       mix3to2_3dnow (samples, bias);
1414 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); // move samples[768..1023] down to samples[512..767]
1415 +       break;
1416 +
1417 +    case CONVERT (A52_3F2R, A52_2F2R):
1418 +       mix3to2_3dnow (samples, bias);
1419 +       memcpy (samples + 512, samples + 768, 256 * sizeof (sample_t)); // move samples[768..1023] down to samples[512..767]
1420 +       memcpy (samples + 768, samples + 1024, 256 * sizeof (sample_t)); // move samples[1024..1279] down to samples[768..1023]
1421 +       break;
1422 +
1423 +    case CONVERT (A52_3F1R, A52_3F2R):
1424 +       memcpy (samples + 1024, samples + 768, 256 * sizeof (sample_t)); // duplicate samples[768..1023] into samples[1024..1279]
1425 +       break;
1426 +    }
1427 +    __asm __volatile("femms":::"memory"); // executed on every path: femms leaves the MMX/3DNow! register state used by the helpers above
1428 +}
1429 +
1430 +#endif // ARCH_X86 || ARCH_X86_64
1431 --- liba52/imdct.c      2006-06-12 15:18:27.000000000 +0200
1432 +++ liba52/imdct.c      2006-06-12 19:18:39.000000000 +0200
1433 @@ -26,6 +26,11 @@
1434   * You should have received a copy of the GNU General Public License
1435   * along with this program; if not, write to the Free Software
1436   * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
1437 + *
1438 + * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
1439 + * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
1440 + *   Michael ported them from libac3 (untested, perhaps totally broken)
1441 + * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
1442   */
1443
1444  #include "config.h"
1445 @@ -39,12 +48,49 @@
1446  #include "a52.h"
1447  #include "a52_internal.h"
1448  #include "mm_accel.h"
1449 +#include "mangle.h"
1450 +
1451 +void (*a52_imdct_512) (sample_t * data, sample_t * delay, sample_t bias); // function pointer taking over the name of the former a52_imdct_512() function (renamed imdct_do_512 by this patch); it is assigned outside this chunk
1452 +
1453 +#ifdef RUNTIME_CPUDETECT
1454 +#undef HAVE_3DNOWEX // NOTE(review): presumably so 3DNow!Ext code is not selected at compile time when the CPU is probed at run time - confirm
1455 +#endif
1456
1457  typedef struct complex_s {
1458      sample_t real;
1459      sample_t imag;
1460  } complex_t;
1461
1462 +static const int pm128[128] attribute_used __attribute__((aligned(16))) = // 16-byte aligned permutation of the indices 0..127; no C code in this chunk reads it (NOTE(review): presumably referenced from inline asm, hence attribute_used - confirm)
1463 +{
1464 +       0, 16, 32, 48, 64, 80,  96, 112,  8, 40, 72, 104, 24, 56,  88, 120,
1465 +       4, 20, 36, 52, 68, 84, 100, 116, 12, 28, 44,  60, 76, 92, 108, 124,
1466 +       2, 18, 34, 50, 66, 82,  98, 114, 10, 42, 74, 106, 26, 58,  90, 122,
1467 +       6, 22, 38, 54, 70, 86, 102, 118, 14, 46, 78, 110, 30, 62,  94, 126,
1468 +       1, 17, 33, 49, 65, 81,  97, 113,  9, 41, 73, 105, 25, 57,  89, 121, // rows 5-8 repeat rows 1-4 with 1 added to every entry
1469 +       5, 21, 37, 53, 69, 85, 101, 117, 13, 29, 45,  61, 77, 93, 109, 125,
1470 +       3, 19, 35, 51, 67, 83,  99, 115, 11, 43, 75, 107, 27, 59,  91, 123,
1471 +       7, 23, 39, 55, 71, 87, 103, 119, 15, 31, 47,  63, 79, 95, 111, 127
1472 +};
1473 +
1474 +static uint8_t attribute_used bit_reverse_512[] = { // 128-entry table: entry i is i with its 7 bits reversed; imdct_do_512_altivec uses it to pick the source index j for buf[i]
1475 +       0x00, 0x40, 0x20, 0x60, 0x10, 0x50, 0x30, 0x70,
1476 +       0x08, 0x48, 0x28, 0x68, 0x18, 0x58, 0x38, 0x78,
1477 +       0x04, 0x44, 0x24, 0x64, 0x14, 0x54, 0x34, 0x74,
1478 +       0x0c, 0x4c, 0x2c, 0x6c, 0x1c, 0x5c, 0x3c, 0x7c,
1479 +       0x02, 0x42, 0x22, 0x62, 0x12, 0x52, 0x32, 0x72,
1480 +       0x0a, 0x4a, 0x2a, 0x6a, 0x1a, 0x5a, 0x3a, 0x7a,
1481 +       0x06, 0x46, 0x26, 0x66, 0x16, 0x56, 0x36, 0x76,
1482 +       0x0e, 0x4e, 0x2e, 0x6e, 0x1e, 0x5e, 0x3e, 0x7e,
1483 +       0x01, 0x41, 0x21, 0x61, 0x11, 0x51, 0x31, 0x71,
1484 +       0x09, 0x49, 0x29, 0x69, 0x19, 0x59, 0x39, 0x79,
1485 +       0x05, 0x45, 0x25, 0x65, 0x15, 0x55, 0x35, 0x75,
1486 +       0x0d, 0x4d, 0x2d, 0x6d, 0x1d, 0x5d, 0x3d, 0x7d,
1487 +       0x03, 0x43, 0x23, 0x63, 0x13, 0x53, 0x33, 0x73,
1488 +       0x0b, 0x4b, 0x2b, 0x6b, 0x1b, 0x5b, 0x3b, 0x7b,
1489 +       0x07, 0x47, 0x27, 0x67, 0x17, 0x57, 0x37, 0x77,
1490 +       0x0f, 0x4f, 0x2f, 0x6f, 0x1f, 0x5f, 0x3f, 0x7f};
1491 +
1492  static uint8_t fftorder[] = {
1493        0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
1494        8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
1495 @@ -56,6 +102,40 @@
1496        6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
1497  };
1498
1499 +static complex_t __attribute__((aligned(16))) buf[128]; // 16-byte aligned work buffer of 128 complex values; filled by the pre-IFFT loop of imdct_do_512_altivec
1500 +
1501 +/* Twiddle factor LUT */
1502 +static complex_t __attribute__((aligned(16))) w_1[1]; // contents of the w_* tables are filled in outside this chunk
1503 +static complex_t __attribute__((aligned(16))) w_2[2];
1504 +static complex_t __attribute__((aligned(16))) w_4[4];
1505 +static complex_t __attribute__((aligned(16))) w_8[8];
1506 +static complex_t __attribute__((aligned(16))) w_16[16];
1507 +static complex_t __attribute__((aligned(16))) w_32[32];
1508 +static complex_t __attribute__((aligned(16))) w_64[64];
1509 +static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64}; // w[k] points at the table holding 2^k entries
1510 +
1511 +/* Twiddle factors for IMDCT */
1512 +static sample_t __attribute__((aligned(16))) xcos1[128]; // read with the bit-reversed index j in imdct_do_512_altivec; filled in outside this chunk
1513 +static sample_t __attribute__((aligned(16))) xsin1[128]; // same indexing as xcos1
1514 +
1515 +#if defined(ARCH_X86) || defined(ARCH_X86_64)
1516 +// NOTE: SSE needs 16-byte alignment or it will segfault
1517 +//
1518 +static float __attribute__((aligned(16))) sseSinCos1c[256]; // contents of the sse* tables are filled in outside this chunk
1519 +static float __attribute__((aligned(16))) sseSinCos1d[256];
1520 +static float attribute_used __attribute__((aligned(16))) ps111_1[4]={1,1,1,-1}; // constant vector {1, 1, 1, -1}; not read by any C code in this chunk
1521 +//static float __attribute__((aligned(16))) sseW0[4];
1522 +static float __attribute__((aligned(16))) sseW1[8];
1523 +static float __attribute__((aligned(16))) sseW2[16];
1524 +static float __attribute__((aligned(16))) sseW3[32];
1525 +static float __attribute__((aligned(16))) sseW4[64];
1526 +static float __attribute__((aligned(16))) sseW5[128];
1527 +static float __attribute__((aligned(16))) sseW6[256];
1528 +static float __attribute__((aligned(16))) *sseW[7]= // sseW[k] points at sseWk, which holds 4 * 2^k floats
1529 +       {NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6}; // slot 0 is NULL because sseW0 is commented out above
1530 +static float __attribute__((aligned(16))) sseWindow[512];
1531 +#endif
1532 +
1533  /* Root values for IFFT */
1534  static sample_t roots16[3];
1535  static sample_t roots32[7];
1536 @@ -241,7 +321,7 @@
1537      ifft_pass (buf, roots128 - 32, 32);
1538  }
1539
1540 -void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias)
1541 +void imdct_do_512 (sample_t * data, sample_t * delay, sample_t bias)
1542  {
1543      int i, k;
1544      sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
1545 @@ -285,6 +365,714 @@
1546      }
1547  }
1548
1549 +#ifdef HAVE_ALTIVEC
1550 +
1551 +#ifdef HAVE_ALTIVEC_H
1552 +#include <altivec.h>
1553 +#endif
1554 +
1555 +// used to build registers permutation vectors (vcprm)
1556 +// the 's' are for words in the _s_econd vector
1557 +#define WORD_0 0x00,0x01,0x02,0x03
1558 +#define WORD_1 0x04,0x05,0x06,0x07
1559 +#define WORD_2 0x08,0x09,0x0a,0x0b
1560 +#define WORD_3 0x0c,0x0d,0x0e,0x0f
1561 +#define WORD_s0 0x10,0x11,0x12,0x13
1562 +#define WORD_s1 0x14,0x15,0x16,0x17
1563 +#define WORD_s2 0x18,0x19,0x1a,0x1b
1564 +#define WORD_s3 0x1c,0x1d,0x1e,0x1f
1565 +
1566 +#ifdef __APPLE_CC__
1567 +#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
1568 +#else
1569 +#define vcprm(a,b,c,d) (const vector unsigned char){WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d}
1570 +#endif
1571 +
1572 +// vcprmle is used to keep the same index as in the SSE version.
1573 +// it's the same as vcprm, with the index order reversed
1574 +// ('le' is Little Endian)
1575 +#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
1576 +
1577 +// used to build inverse/identity vectors (vcii)
1578 +// n is _n_egative, p is _p_ositive
1579 +#define FLOAT_n -1.
1580 +#define FLOAT_p 1.
1581 +
1582 +#ifdef __APPLE_CC__
1583 +#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
1584 +#else
1585 +#define vcii(a,b,c,d) (const vector float){FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d}
1586 +#endif
1587 +
1588 +#ifdef __APPLE_CC__
1589 +#define FOUROF(a) (a)
1590 +#else
1591 +#define FOUROF(a) {a,a,a,a}
1592 +#endif
1593 +
1594 +
1595 +void
1596 +imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
1597 +{
1598 +  int i;
1599 +  int k;
1600 +  int p,q;
1601 +  int m;
1602 +  long two_m;
1603 +  long two_m_plus_one;
1604 +
1605 +  sample_t tmp_b_i;
1606 +  sample_t tmp_b_r;
1607 +  sample_t tmp_a_i;
1608 +  sample_t tmp_a_r;
1609 +
1610 +  sample_t *data_ptr;
1611 +  sample_t *delay_ptr;
1612 +  sample_t *window_ptr;
1613 +
1614 +  /* 512 IMDCT with source and dest data in 'data' */
1615 +
1616 +  /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
1617 +  for( i=0; i < 128; i++) {
1618 +    /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */
1619 +    int j= bit_reverse_512[i];
1620 +    buf[i].real =         (data[256-2*j-1] * xcos1[j])  -  (data[2*j]       * xsin1[j]);
1621 +    buf[i].imag = -1.0 * ((data[2*j]       * xcos1[j])  +  (data[256-2*j-1] * xsin1[j]));
1622 +  }
1623 +
1624 +  /* 1. iteration */
1625 +  for(i = 0; i < 128; i += 2) {
1626 +#if 0
1627 +    tmp_a_r = buf[i].real;
1628 +    tmp_a_i = buf[i].imag;
1629 +    tmp_b_r = buf[i+1].real;
1630 +    tmp_b_i = buf[i+1].imag;
1631 +    buf[i].real = tmp_a_r + tmp_b_r;
1632 +    buf[i].imag =  tmp_a_i + tmp_b_i;
1633 +    buf[i+1].real = tmp_a_r - tmp_b_r;
1634 +    buf[i+1].imag =  tmp_a_i - tmp_b_i;
1635 +#else
1636 +    vector float temp, bufv;
1637 +
1638 +    bufv = vec_ld(i << 3, (float*)buf);
1639 +    temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
1640 +    bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
1641 +    vec_st(bufv, i << 3, (float*)buf);
1642 +#endif
1643 +  }
1644 +
1645 +  /* 2. iteration */
1646 +  // Note w[1]={{1,0}, {0,-1}}
1647 +  for(i = 0; i < 128; i += 4) {
1648 +#if 0
1649 +    tmp_a_r = buf[i].real;
1650 +    tmp_a_i = buf[i].imag;
1651 +    tmp_b_r = buf[i+2].real;
1652 +    tmp_b_i = buf[i+2].imag;
1653 +    buf[i].real = tmp_a_r + tmp_b_r;
1654 +    buf[i].imag =  tmp_a_i + tmp_b_i;
1655 +    buf[i+2].real = tmp_a_r - tmp_b_r;
1656 +    buf[i+2].imag =  tmp_a_i - tmp_b_i;
1657 +    tmp_a_r = buf[i+1].real;
1658 +    tmp_a_i = buf[i+1].imag;
1659 +    /* WARNING: im <-> re here ! */
1660 +    tmp_b_r = buf[i+3].imag;
1661 +    tmp_b_i = buf[i+3].real;
1662 +    buf[i+1].real = tmp_a_r + tmp_b_r;
1663 +    buf[i+1].imag =  tmp_a_i - tmp_b_i;
1664 +    buf[i+3].real = tmp_a_r - tmp_b_r;
1665 +    buf[i+3].imag =  tmp_a_i + tmp_b_i;
1666 +#else
1667 +    vector float buf01, buf23, temp1, temp2;
1668 +
1669 +    buf01 = vec_ld((i + 0) << 3, (float*)buf);
1670 +    buf23 = vec_ld((i + 2) << 3, (float*)buf);
1671 +    buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
1672 +
1673 +    temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
1674 +    temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
1675 +
1676 +    vec_st(temp1, (i + 0) << 3, (float*)buf);
1677 +    vec_st(temp2, (i + 2) << 3, (float*)buf);
1678 +#endif
1679 +  }
1680 +
1681 +  /* 3. iteration */
1682 +  for(i = 0; i < 128; i += 8) {
1683 +#if 0
1684 +    tmp_a_r = buf[i].real;
1685 +    tmp_a_i = buf[i].imag;
1686 +    tmp_b_r = buf[i+4].real;
1687 +    tmp_b_i = buf[i+4].imag;
1688 +    buf[i].real = tmp_a_r + tmp_b_r;
1689 +    buf[i].imag =  tmp_a_i + tmp_b_i;
1690 +    buf[i+4].real = tmp_a_r - tmp_b_r;
1691 +    buf[i+4].imag =  tmp_a_i - tmp_b_i;
1692 +    tmp_a_r = buf[1+i].real;
1693 +    tmp_a_i = buf[1+i].imag;
1694 +    tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
1695 +    tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
1696 +    buf[1+i].real = tmp_a_r + tmp_b_r;
1697 +    buf[1+i].imag =  tmp_a_i + tmp_b_i;
1698 +    buf[i+5].real = tmp_a_r - tmp_b_r;
1699 +    buf[i+5].imag =  tmp_a_i - tmp_b_i;
1700 +    tmp_a_r = buf[i+2].real;
1701 +    tmp_a_i = buf[i+2].imag;
1702 +    /* WARNING re <-> im & sign */
1703 +    tmp_b_r = buf[i+6].imag;
1704 +    tmp_b_i = - buf[i+6].real;
1705 +    buf[i+2].real = tmp_a_r + tmp_b_r;
1706 +    buf[i+2].imag =  tmp_a_i + tmp_b_i;
1707 +    buf[i+6].real = tmp_a_r - tmp_b_r;
1708 +    buf[i+6].imag =  tmp_a_i - tmp_b_i;
1709 +    tmp_a_r = buf[i+3].real;
1710 +    tmp_a_i = buf[i+3].imag;
1711 +    tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
1712 +    tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
1713 +    buf[i+3].real = tmp_a_r + tmp_b_r;
1714 +    buf[i+3].imag =  tmp_a_i + tmp_b_i;
1715 +    buf[i+7].real = tmp_a_r - tmp_b_r;
1716 +    buf[i+7].imag =  tmp_a_i - tmp_b_i;
1717 +#else
1718 +    vector float buf01, buf23, buf45, buf67;
1719 +
1720 +    buf01 = vec_ld((i + 0) << 3, (float*)buf);
1721 +    buf23 = vec_ld((i + 2) << 3, (float*)buf);
1722 +
1723 +    tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
1724 +    tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
1725 +    buf[i+5].real = tmp_b_r;
1726 +    buf[i+5].imag = tmp_b_i;
1727 +    tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
1728 +    tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
1729 +    buf[i+7].real = tmp_b_r;
1730 +    buf[i+7].imag = tmp_b_i;
1731 +
1732 +    buf23 = vec_ld((i + 2) << 3, (float*)buf);
1733 +    buf45 = vec_ld((i + 4) << 3, (float*)buf);
1734 +    buf67 = vec_ld((i + 6) << 3, (float*)buf);
1735 +    buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
1736 +
1737 +    vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
1738 +    vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
1739 +    vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
1740 +    vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
1741 +#endif
1742 +  }
1743 +
1744 +  /* 4-7. iterations */
1745 +  for (m=3; m < 7; m++) {
1746 +    two_m = (1 << m);
1747 +
1748 +    two_m_plus_one = two_m<<1;
1749 +
1750 +    for(i = 0; i < 128; i += two_m_plus_one) {
1751 +      for(k = 0; k < two_m; k+=2) {
1752 +#if 0
1753 +        int p = k + i;
1754 +        int q = p + two_m;
1755 +        tmp_a_r = buf[p].real;
1756 +        tmp_a_i = buf[p].imag;
1757 +        tmp_b_r =
1758 +          buf[q].real * w[m][k].real -
1759 +          buf[q].imag * w[m][k].imag;
1760 +        tmp_b_i =
1761 +          buf[q].imag * w[m][k].real +
1762 +          buf[q].real * w[m][k].imag;
1763 +        buf[p].real = tmp_a_r + tmp_b_r;
1764 +        buf[p].imag =  tmp_a_i + tmp_b_i;
1765 +        buf[q].real = tmp_a_r - tmp_b_r;
1766 +        buf[q].imag =  tmp_a_i - tmp_b_i;
1767 +
1768 +        tmp_a_r = buf[(p + 1)].real;
1769 +        tmp_a_i = buf[(p + 1)].imag;
1770 +        tmp_b_r =
1771 +          buf[(q + 1)].real * w[m][(k + 1)].real -
1772 +          buf[(q + 1)].imag * w[m][(k + 1)].imag;
1773 +        tmp_b_i =
1774 +          buf[(q + 1)].imag * w[m][(k + 1)].real +
1775 +          buf[(q + 1)].real * w[m][(k + 1)].imag;
1776 +        buf[(p + 1)].real = tmp_a_r + tmp_b_r;
1777 +        buf[(p + 1)].imag =  tmp_a_i + tmp_b_i;
1778 +        buf[(q + 1)].real = tmp_a_r - tmp_b_r;
1779 +        buf[(q + 1)].imag =  tmp_a_i - tmp_b_i;
1780 +#else
1781 +        int p = k + i;
1782 +        int q = p + two_m;
1783 +        vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
1784 +        const vector float vczero = (const vector float)FOUROF(0.);
1785 +        // first compute buf[q] and buf[q+1]
1786 +        vecq = vec_ld(q << 3, (float*)buf);
1787 +        vecw = vec_ld(0, (float*)&(w[m][k]));
1788 +        temp1 = vec_madd(vecq, vecw, vczero);
1789 +        temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
1790 +        temp2 = vec_madd(temp2, vecw, vczero);
1791 +        temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
1792 +        temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
1793 +        vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
1794 +        // then butterfly with buf[p] and buf[p+1]
1795 +        vecp = vec_ld(p << 3, (float*)buf);
1796 +
1797 +        temp1 = vec_add(vecp, vecq);
1798 +        temp2 = vec_sub(vecp, vecq);
1799 +
1800 +        vec_st(temp1, p << 3, (float*)buf);
1801 +        vec_st(temp2, q << 3, (float*)buf);
1802 +#endif
1803 +      }
1804 +    }
1805 +  }
1806 +
1807 +  /* Post IFFT complex multiply  plus IFFT complex conjugate*/
1808 +  for( i=0; i < 128; i+=4) {
1809 +    /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
1810 +#if 0
1811 +    tmp_a_r =        buf[(i + 0)].real;
1812 +    tmp_a_i = -1.0 * buf[(i + 0)].imag;
1813 +    buf[(i + 0)].real =
1814 +      (tmp_a_r * xcos1[(i + 0)])  -  (tmp_a_i  * xsin1[(i + 0)]);
1815 +    buf[(i + 0)].imag =
1816 +      (tmp_a_r * xsin1[(i + 0)])  +  (tmp_a_i  * xcos1[(i + 0)]);
1817 +
1818 +    tmp_a_r =        buf[(i + 1)].real;
1819 +    tmp_a_i = -1.0 * buf[(i + 1)].imag;
1820 +    buf[(i + 1)].real =
1821 +      (tmp_a_r * xcos1[(i + 1)])  -  (tmp_a_i  * xsin1[(i + 1)]);
1822 +    buf[(i + 1)].imag =
1823 +      (tmp_a_r * xsin1[(i + 1)])  +  (tmp_a_i  * xcos1[(i + 1)]);
1824 +
1825 +    tmp_a_r =        buf[(i + 2)].real;
1826 +    tmp_a_i = -1.0 * buf[(i + 2)].imag;
1827 +    buf[(i + 2)].real =
1828 +      (tmp_a_r * xcos1[(i + 2)])  -  (tmp_a_i  * xsin1[(i + 2)]);
1829 +    buf[(i + 2)].imag =
1830 +      (tmp_a_r * xsin1[(i + 2)])  +  (tmp_a_i  * xcos1[(i + 2)]);
1831 +
1832 +    tmp_a_r =        buf[(i + 3)].real;
1833 +    tmp_a_i = -1.0 * buf[(i + 3)].imag;
1834 +    buf[(i + 3)].real =
1835 +      (tmp_a_r * xcos1[(i + 3)])  -  (tmp_a_i  * xsin1[(i + 3)]);
1836 +    buf[(i + 3)].imag =
1837 +      (tmp_a_r * xsin1[(i + 3)])  +  (tmp_a_i  * xcos1[(i + 3)]);
1838 +#else
1839 +    vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
1840 +    vector float temp0022, temp1133, tempCS01;
1841 +    const vector float vczero = (const vector float)FOUROF(0.);
1842 +
1843 +    bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
1844 +    bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
1845 +
1846 +    cosv = vec_ld(i << 2, xcos1);
1847 +    sinv = vec_ld(i << 2, xsin1);
1848 +
1849 +    temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
1850 +    temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
1851 +    tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
1852 +    temp1 = vec_madd(temp0022, tempCS01, vczero);
1853 +    tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
1854 +    temp2 = vec_madd(temp1133, tempCS01, vczero);
1855 +    bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
1856 +
1857 +    vec_st(bufv_0, (i + 0) << 3, (float*)buf);
1858 +
1859 +    /* idem with bufv_2 and high-order cosv/sinv */
1860 +
1861 +    temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
1862 +    temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
1863 +    tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
1864 +    temp1 = vec_madd(temp0022, tempCS01, vczero);
1865 +    tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
1866 +    temp2 = vec_madd(temp1133, tempCS01, vczero);
1867 +    bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
1868 +
1869 +    vec_st(bufv_2, (i + 2) << 3, (float*)buf);
1870 +
1871 +#endif
1872 +  }
1873 +
1874 +  data_ptr = data;
1875 +  delay_ptr = delay;
1876 +  window_ptr = a52_imdct_window;
1877 +
1878 +  /* Window and convert to real valued signal */
1879 +  for(i=0; i< 64; i++) {
1880 +    *data_ptr++   = -buf[64+i].imag   * *window_ptr++ + *delay_ptr++ + bias;
1881 +    *data_ptr++   =  buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias;
1882 +  }
1883 +
1884 +  for(i=0; i< 64; i++) {
1885 +    *data_ptr++  = -buf[i].real       * *window_ptr++ + *delay_ptr++ + bias;
1886 +    *data_ptr++  =  buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias;
1887 +  }
1888 +
1889 +  /* The trailing edge of the window goes into the delay line */
1890 +  delay_ptr = delay;
1891 +
1892 +  for(i=0; i< 64; i++) {
1893 +    *delay_ptr++  = -buf[64+i].real   * *--window_ptr;
1894 +    *delay_ptr++  =  buf[64-i-1].imag * *--window_ptr;
1895 +  }
1896 +
1897 +  for(i=0; i<64; i++) {
1898 +    *delay_ptr++  =  buf[i].imag       * *--window_ptr;
1899 +    *delay_ptr++  = -buf[128-i-1].real * *--window_ptr;
1900 +  }
1901 +}
1902 +#endif
1903 +
1904 +
1905 +// Stuff below this line is borrowed from libac3
1906 +#include "srfftp.h"
1907 +#if defined(ARCH_X86) || defined(ARCH_X86_64)
1908 +#ifndef HAVE_3DNOW
1909 +#define HAVE_3DNOW 1
1910 +#endif
1911 +#include "srfftp_3dnow.h"
1912 +
1913 +const i_cmplx_t x_plus_minus_3dnow __attribute__ ((aligned (8))) = {{ 0x00000000UL, 0x80000000UL }};
1914 +const i_cmplx_t x_minus_plus_3dnow __attribute__ ((aligned (8))) = {{ 0x80000000UL, 0x00000000UL }};
1915 +const complex_t HSQRT2_3DNOW __attribute__ ((aligned (8))) = { 0.707106781188, 0.707106781188 };
1916 +
1917 +#undef HAVE_3DNOWEX
1918 +#include "imdct_3dnow.h"
1919 +#define HAVE_3DNOWEX
1920 +#include "imdct_3dnow.h"
1921 +
1922 +void
1923 +imdct_do_512_sse(sample_t data[],sample_t delay[], sample_t bias)
1924 +{
1925 +/*     int i,k;
1926 +    int p,q;*/
1927 +    int m;
1928 +    long two_m;
1929 +    long two_m_plus_one;
1930 +    long two_m_plus_one_shl3;
1931 +    complex_t *buf_offset;
1932 +
1933 +/*  sample_t tmp_a_i;
1934 +    sample_t tmp_a_r;
1935 +    sample_t tmp_b_i;
1936 +    sample_t tmp_b_r;*/
1937 +
1938 +    sample_t *data_ptr;
1939 +    sample_t *delay_ptr;
1940 +    sample_t *window_ptr;
1941 +
1942 +    /* 512 IMDCT with source and dest data in 'data' */
1943 +    /* see the C version (dct_do_512()), it's almost identical, just in C */
1944 +
1945 +    /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
1946 +    /* Bit reversed shuffling */
1947 +       asm volatile(
1948 +               "xor %%"REG_S", %%"REG_S"               \n\t"
1949 +               "lea "MANGLE(bit_reverse_512)", %%"REG_a"\n\t"
1950 +               "mov $1008, %%"REG_D"                   \n\t"
1951 +               "push %%"REG_BP"                        \n\t" //use ebp without telling gcc
1952 +               ASMALIGN(4)
1953 +               "1:                                     \n\t"
1954 +               "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // XXXI
1955 +               "movhps 8(%0, %%"REG_D"), %%xmm0        \n\t" // RXXI
1956 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // XXXi
1957 +               "movhps (%0, %%"REG_D"), %%xmm1 \n\t" // rXXi
1958 +               "shufps $0x33, %%xmm1, %%xmm0           \n\t" // irIR
1959 +               "movaps "MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm2\n\t"
1960 +               "mulps %%xmm0, %%xmm2                   \n\t"
1961 +               "shufps $0xB1, %%xmm0, %%xmm0           \n\t" // riRI
1962 +               "mulps "MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
1963 +               "subps %%xmm0, %%xmm2                   \n\t"
1964 +               "movzb (%%"REG_a"), %%"REG_d"           \n\t"
1965 +               "movzb 1(%%"REG_a"), %%"REG_BP"         \n\t"
1966 +               "movlps %%xmm2, (%1, %%"REG_d", 8)      \n\t"
1967 +               "movhps %%xmm2, (%1, %%"REG_BP", 8)     \n\t"
1968 +               "add $16, %%"REG_S"                     \n\t"
1969 +               "add $2, %%"REG_a"                      \n\t" // avoid complex addressing for P4 crap
1970 +               "sub $16, %%"REG_D"                     \n\t"
1971 +               "jnc 1b                                 \n\t"
1972 +               "pop %%"REG_BP"                         \n\t"//no we didnt touch ebp *g*
1973 +               :: "b" (data), "c" (buf)
1974 +               : "%"REG_S, "%"REG_D, "%"REG_a, "%"REG_d
1975 +       );
1976 +
1977 +
1978 +    /* FFT Merge */
1979 +/* unoptimized variant
1980 +    for (m=1; m < 7; m++) {
1981 +       if(m)
1982 +           two_m = (1 << m);
1983 +       else
1984 +           two_m = 1;
1985 +
1986 +       two_m_plus_one = (1 << (m+1));
1987 +
1988 +       for(i = 0; i < 128; i += two_m_plus_one) {
1989 +           for(k = 0; k < two_m; k++) {
1990 +               p = k + i;
1991 +               q = p + two_m;
1992 +               tmp_a_r = buf[p].real;
1993 +               tmp_a_i = buf[p].imag;
1994 +               tmp_b_r = buf[q].real * w[m][k].real - buf[q].imag * w[m][k].imag;
1995 +               tmp_b_i = buf[q].imag * w[m][k].real + buf[q].real * w[m][k].imag;
1996 +               buf[p].real = tmp_a_r + tmp_b_r;
1997 +               buf[p].imag =  tmp_a_i + tmp_b_i;
1998 +               buf[q].real = tmp_a_r - tmp_b_r;
1999 +               buf[q].imag =  tmp_a_i - tmp_b_i;
2000 +           }
2001 +       }
2002 +    }
2003 +*/
2004 +
2005 +    /* 1. iteration */
2006 +       // Note w[0][0]={1,0}
2007 +       asm volatile(
2008 +               "xorps %%xmm1, %%xmm1   \n\t"
2009 +               "xorps %%xmm2, %%xmm2   \n\t"
2010 +               "mov %0, %%"REG_S"      \n\t"
2011 +               ASMALIGN(4)
2012 +               "1:                     \n\t"
2013 +               "movlps (%%"REG_S"), %%xmm0\n\t" //buf[p]
2014 +               "movlps 8(%%"REG_S"), %%xmm1\n\t" //buf[q]
2015 +               "movhps (%%"REG_S"), %%xmm0\n\t" //buf[p]
2016 +               "movhps 8(%%"REG_S"), %%xmm2\n\t" //buf[q]
2017 +               "addps %%xmm1, %%xmm0   \n\t"
2018 +               "subps %%xmm2, %%xmm0   \n\t"
2019 +               "movaps %%xmm0, (%%"REG_S")\n\t"
2020 +               "add $16, %%"REG_S"     \n\t"
2021 +               "cmp %1, %%"REG_S"      \n\t"
2022 +               " jb 1b                 \n\t"
2023 +               :: "g" (buf), "r" (buf + 128)
2024 +               : "%"REG_S
2025 +       );
2026 +
2027 +    /* 2. iteration */
2028 +       // Note w[1]={{1,0}, {0,-1}}
2029 +       asm volatile(
2030 +               "movaps "MANGLE(ps111_1)", %%xmm7\n\t" // 1,1,1,-1
2031 +               "mov %0, %%"REG_S"              \n\t"
2032 +               ASMALIGN(4)
2033 +               "1:                             \n\t"
2034 +               "movaps 16(%%"REG_S"), %%xmm2   \n\t" //r2,i2,r3,i3
2035 +               "shufps $0xB4, %%xmm2, %%xmm2   \n\t" //r2,i2,i3,r3
2036 +               "mulps %%xmm7, %%xmm2           \n\t" //r2,i2,i3,-r3
2037 +               "movaps (%%"REG_S"), %%xmm0     \n\t" //r0,i0,r1,i1
2038 +               "movaps (%%"REG_S"), %%xmm1     \n\t" //r0,i0,r1,i1
2039 +               "addps %%xmm2, %%xmm0           \n\t"
2040 +               "subps %%xmm2, %%xmm1           \n\t"
2041 +               "movaps %%xmm0, (%%"REG_S")     \n\t"
2042 +               "movaps %%xmm1, 16(%%"REG_S")   \n\t"
2043 +               "add $32, %%"REG_S"     \n\t"
2044 +               "cmp %1, %%"REG_S"      \n\t"
2045 +               " jb 1b                 \n\t"
2046 +               :: "g" (buf), "r" (buf + 128)
2047 +               : "%"REG_S
2048 +       );
2049 +
2050 +    /* 3. iteration */
2051 +/*
2052 + Note sseW2+0={1,1,sqrt(2)/2,sqrt(2)/2} (offsets are in bytes)
2053 + Note sseW2+16={0,0,sqrt(2)/2,-sqrt(2)/2}
2054 + Note sseW2+32={0,0,-sqrt(2)/2,-sqrt(2)/2}
2055 + Note sseW2+48={1,-1,sqrt(2)/2,-sqrt(2)/2}
2056 +*/
2057 +       asm volatile(
2058 +               "movaps 48+"MANGLE(sseW2)", %%xmm6\n\t"
2059 +               "movaps 16+"MANGLE(sseW2)", %%xmm7\n\t"
2060 +               "xorps %%xmm5, %%xmm5           \n\t"
2061 +               "xorps %%xmm2, %%xmm2           \n\t"
2062 +               "mov %0, %%"REG_S"              \n\t"
2063 +               ASMALIGN(4)
2064 +               "1:                             \n\t"
2065 +               "movaps 32(%%"REG_S"), %%xmm2   \n\t" //r4,i4,r5,i5
2066 +               "movaps 48(%%"REG_S"), %%xmm3   \n\t" //r6,i6,r7,i7
2067 +               "movaps "MANGLE(sseW2)", %%xmm4 \n\t" //r4,i4,r5,i5
2068 +               "movaps 32+"MANGLE(sseW2)", %%xmm5\n\t" //r6,i6,r7,i7
2069 +               "mulps %%xmm2, %%xmm4           \n\t"
2070 +               "mulps %%xmm3, %%xmm5           \n\t"
2071 +               "shufps $0xB1, %%xmm2, %%xmm2   \n\t" //i4,r4,i5,r5
2072 +               "shufps $0xB1, %%xmm3, %%xmm3   \n\t" //i6,r6,i7,r7
2073 +               "mulps %%xmm6, %%xmm3           \n\t"
2074 +               "mulps %%xmm7, %%xmm2           \n\t"
2075 +               "movaps (%%"REG_S"), %%xmm0     \n\t" //r0,i0,r1,i1
2076 +               "movaps 16(%%"REG_S"), %%xmm1   \n\t" //r2,i2,r3,i3
2077 +               "addps %%xmm4, %%xmm2           \n\t"
2078 +               "addps %%xmm5, %%xmm3           \n\t"
2079 +               "movaps %%xmm2, %%xmm4          \n\t"
2080 +               "movaps %%xmm3, %%xmm5          \n\t"
2081 +               "addps %%xmm0, %%xmm2           \n\t"
2082 +               "addps %%xmm1, %%xmm3           \n\t"
2083 +               "subps %%xmm4, %%xmm0           \n\t"
2084 +               "subps %%xmm5, %%xmm1           \n\t"
2085 +               "movaps %%xmm2, (%%"REG_S")     \n\t"
2086 +               "movaps %%xmm3, 16(%%"REG_S")   \n\t"
2087 +               "movaps %%xmm0, 32(%%"REG_S")   \n\t"
2088 +               "movaps %%xmm1, 48(%%"REG_S")   \n\t"
2089 +               "add $64, %%"REG_S"     \n\t"
2090 +               "cmp %1, %%"REG_S"      \n\t"
2091 +               " jb 1b                 \n\t"
2092 +               :: "g" (buf), "r" (buf + 128)
2093 +               : "%"REG_S
2094 +       );
2095 +
2096 +    /* 4-7. iterations */
2097 +    for (m=3; m < 7; m++) {
2098 +       two_m = (1 << m);
2099 +       two_m_plus_one = two_m<<1;
2100 +       two_m_plus_one_shl3 = (two_m_plus_one<<3);
2101 +       buf_offset = buf+128;
2102 +       asm volatile(
2103 +               "mov %0, %%"REG_S"                      \n\t"
2104 +               ASMALIGN(4)
2105 +               "1:                                     \n\t"
2106 +               "xor %%"REG_D", %%"REG_D"               \n\t" // k
2107 +               "lea (%%"REG_S", %3), %%"REG_d"         \n\t"
2108 +               "2:                                     \n\t"
2109 +               "movaps (%%"REG_d", %%"REG_D"), %%xmm1  \n\t"
2110 +               "movaps (%4, %%"REG_D", 2), %%xmm2      \n\t"
2111 +               "mulps %%xmm1, %%xmm2                   \n\t"
2112 +               "shufps $0xB1, %%xmm1, %%xmm1           \n\t"
2113 +               "mulps 16(%4, %%"REG_D", 2), %%xmm1     \n\t"
2114 +               "movaps (%%"REG_S", %%"REG_D"), %%xmm0  \n\t"
2115 +               "addps %%xmm2, %%xmm1                   \n\t"
2116 +               "movaps %%xmm1, %%xmm2                  \n\t"
2117 +               "addps %%xmm0, %%xmm1                   \n\t"
2118 +               "subps %%xmm2, %%xmm0                   \n\t"
2119 +               "movaps %%xmm1, (%%"REG_S", %%"REG_D")  \n\t"
2120 +               "movaps %%xmm0, (%%"REG_d", %%"REG_D")  \n\t"
2121 +               "add $16, %%"REG_D"                     \n\t"
2122 +               "cmp %3, %%"REG_D"                      \n\t" //FIXME (opt) count against 0
2123 +               "jb 2b                                  \n\t"
2124 +               "add %2, %%"REG_S"                      \n\t"
2125 +               "cmp %1, %%"REG_S"                      \n\t"
2126 +               " jb 1b                                 \n\t"
2127 +               :: "g" (buf), "m" (buf_offset), "m" (two_m_plus_one_shl3), "r" (two_m<<3),
2128 +                  "r" (sseW[m])
2129 +               : "%"REG_S, "%"REG_D, "%"REG_d
2130 +       );
2131 +    }
2132 +
2133 +    /* Post IFFT complex multiply  plus IFFT complex conjugate*/
2134 +       asm volatile(
2135 +               "mov $-1024, %%"REG_S"                  \n\t"
2136 +               ASMALIGN(4)
2137 +               "1:                                     \n\t"
2138 +               "movaps (%0, %%"REG_S"), %%xmm0         \n\t"
2139 +               "movaps (%0, %%"REG_S"), %%xmm1         \n\t"
2140 +               "shufps $0xB1, %%xmm0, %%xmm0           \n\t"
2141 +               "mulps 1024+"MANGLE(sseSinCos1c)"(%%"REG_S"), %%xmm1\n\t"
2142 +               "mulps 1024+"MANGLE(sseSinCos1d)"(%%"REG_S"), %%xmm0\n\t"
2143 +               "addps %%xmm1, %%xmm0                   \n\t"
2144 +               "movaps %%xmm0, (%0, %%"REG_S")         \n\t"
2145 +               "add $16, %%"REG_S"                     \n\t"
2146 +               " jnz 1b                                \n\t"
2147 +               :: "r" (buf+128)
2148 +               : "%"REG_S
2149 +       );
2150 +
2151 +
2152 +    data_ptr = data;
2153 +    delay_ptr = delay;
2154 +    window_ptr = a52_imdct_window;
2155 +
2156 +    /* Window and convert to real valued signal */
2157 +       asm volatile(
2158 +               "xor %%"REG_D", %%"REG_D"               \n\t"  // 0
2159 +               "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
2160 +               "movss %3, %%xmm2                       \n\t"  // bias
2161 +               "shufps $0x00, %%xmm2, %%xmm2           \n\t"  // bias, bias, ...
2162 +               ASMALIGN(4)
2163 +               "1:                                     \n\t"
2164 +               "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? A ?
2165 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? C ?
2166 +               "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // ? D C ?
2167 +               "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // ? B A ?
2168 +               "shufps $0x99, %%xmm1, %%xmm0           \n\t" // D C B A
2169 +               "mulps "MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
2170 +               "addps (%2, %%"REG_S"), %%xmm0          \n\t"
2171 +               "addps %%xmm2, %%xmm0                   \n\t"
2172 +               "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
2173 +               "add  $16, %%"REG_S"                    \n\t"
2174 +               "sub  $16, %%"REG_D"                    \n\t"
2175 +               "cmp  $512, %%"REG_S"                   \n\t"
2176 +               " jb 1b                                 \n\t"
2177 +               :: "r" (buf+64), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
2178 +               : "%"REG_S, "%"REG_D
2179 +       );
2180 +       data_ptr+=128;
2181 +       delay_ptr+=128;
2182 +//     window_ptr+=128;
2183 +
2184 +       asm volatile(
2185 +               "mov $1024, %%"REG_D"                   \n\t"  // 1024
2186 +               "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
2187 +               "movss %3, %%xmm2                       \n\t"  // bias
2188 +               "shufps $0x00, %%xmm2, %%xmm2           \n\t"  // bias, bias, ...
2189 +               ASMALIGN(4)
2190 +               "1:                                     \n\t"
2191 +               "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? ? A
2192 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? ? C
2193 +               "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // D ? ? C
2194 +               "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // B ? ? A
2195 +               "shufps $0xCC, %%xmm1, %%xmm0           \n\t" // D C B A
2196 +               "mulps 512+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
2197 +               "addps (%2, %%"REG_S"), %%xmm0          \n\t"
2198 +               "addps %%xmm2, %%xmm0                   \n\t"
2199 +               "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
2200 +               "add $16, %%"REG_S"                     \n\t"
2201 +               "sub $16, %%"REG_D"                     \n\t"
2202 +               "cmp $512, %%"REG_S"                    \n\t"
2203 +               " jb 1b                                 \n\t"
2204 +               :: "r" (buf), "r" (data_ptr), "r" (delay_ptr), "m" (bias)
2205 +               : "%"REG_S, "%"REG_D
2206 +       );
2207 +       data_ptr+=128;
2208 +//     window_ptr+=128;
2209 +
2210 +    /* The trailing edge of the window goes into the delay line */
2211 +    delay_ptr = delay;
2212 +
2213 +       asm volatile(
2214 +               "xor %%"REG_D", %%"REG_D"               \n\t"  // 0
2215 +               "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
2216 +               ASMALIGN(4)
2217 +               "1:                                     \n\t"
2218 +               "movlps (%0, %%"REG_S"), %%xmm0         \n\t" // ? ? ? A
2219 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? ? C
2220 +               "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // D ? ? C
2221 +               "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // B ? ? A
2222 +               "shufps $0xCC, %%xmm1, %%xmm0           \n\t" // D C B A
2223 +               "mulps 1024+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
2224 +               "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
2225 +               "add $16, %%"REG_S"                     \n\t"
2226 +               "sub $16, %%"REG_D"                     \n\t"
2227 +               "cmp $512, %%"REG_S"                    \n\t"
2228 +               " jb 1b                                 \n\t"
2229 +               :: "r" (buf+64), "r" (delay_ptr)
2230 +               : "%"REG_S, "%"REG_D
2231 +       );
2232 +       delay_ptr+=128;
2233 +//     window_ptr-=128;
2234 +
2235 +       asm volatile(
2236 +               "mov $1024, %%"REG_D"                   \n\t"  // 1024
2237 +               "xor %%"REG_S", %%"REG_S"               \n\t"  // 0
2238 +               ASMALIGN(4)
2239 +               "1:                                     \n\t"
2240 +               "movlps (%0, %%"REG_S"), %%xmm0 \n\t" // ? ? A ?
2241 +               "movlps 8(%0, %%"REG_S"), %%xmm1        \n\t" // ? ? C ?
2242 +               "movhps -16(%0, %%"REG_D"), %%xmm1      \n\t" // ? D C ?
2243 +               "movhps -8(%0, %%"REG_D"), %%xmm0       \n\t" // ? B A ?
2244 +               "shufps $0x99, %%xmm1, %%xmm0           \n\t" // D C B A
2245 +               "mulps 1536+"MANGLE(sseWindow)"(%%"REG_S"), %%xmm0\n\t"
2246 +               "movaps %%xmm0, (%1, %%"REG_S")         \n\t"
2247 +               "add $16, %%"REG_S"                     \n\t"
2248 +               "sub $16, %%"REG_D"                     \n\t"
2249 +               "cmp $512, %%"REG_S"                    \n\t"
2250 +               " jb 1b                                 \n\t"
2251 +               :: "r" (buf), "r" (delay_ptr)
2252 +               : "%"REG_S, "%"REG_D
2253 +       );
2254 +}
2255 +#endif // ARCH_X86 || ARCH_X86_64
2256 +
2257  void a52_imdct_256(sample_t * data, sample_t * delay, sample_t bias)
2258  {
2259      int i, k;
2260 @@ -364,7 +1152,7 @@
2261
2262  void a52_imdct_init (uint32_t mm_accel)
2263  {
2264 -    int i, k;
2265 +    int i, j, k;
2266      double sum;
2267
2268      /* compute imdct window - kaiser-bessel derived window, alpha = 5.0 */
2269 @@ -416,6 +1204,99 @@
2270         post2[i].real = cos ((M_PI / 128) * (i + 0.5));
2271         post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
2272      }
2273 +    for (i = 0; i < 128; i++) {
2274 +       xcos1[i] = -cos ((M_PI / 2048) * (8 * i + 1));
2275 +       xsin1[i] = -sin ((M_PI / 2048) * (8 * i + 1));
2276 +    }
2277 +    for (i = 0; i < 7; i++) {
2278 +       j = 1 << i;
2279 +       for (k = 0; k < j; k++) {
2280 +           w[i][k].real = cos (-M_PI * k / j);
2281 +           w[i][k].imag = sin (-M_PI * k / j);
2282 +       }
2283 +    }
2284 +#if defined(ARCH_X86) || defined(ARCH_X86_64)
2285 +       for (i = 0; i < 128; i++) {
2286 +           sseSinCos1c[2*i+0]= xcos1[i];
2287 +           sseSinCos1c[2*i+1]= -xcos1[i];
2288 +           sseSinCos1d[2*i+0]= xsin1[i];
2289 +           sseSinCos1d[2*i+1]= xsin1[i];
2290 +       }
2291 +       for (i = 1; i < 7; i++) {
2292 +           j = 1 << i;
2293 +           for (k = 0; k < j; k+=2) {
2294 +
2295 +               sseW[i][4*k + 0] = w[i][k+0].real;
2296 +               sseW[i][4*k + 1] = w[i][k+0].real;
2297 +               sseW[i][4*k + 2] = w[i][k+1].real;
2298 +               sseW[i][4*k + 3] = w[i][k+1].real;
2299 +
2300 +               sseW[i][4*k + 4] = -w[i][k+0].imag;
2301 +               sseW[i][4*k + 5] = w[i][k+0].imag;
2302 +               sseW[i][4*k + 6] = -w[i][k+1].imag;
2303 +               sseW[i][4*k + 7] = w[i][k+1].imag;
2304 +
2305 +       //we multiply more or less uninitialized numbers so we need to use exactly 0.0
2306 +               if(k==0)
2307 +               {
2308 +//                     sseW[i][4*k + 0]= sseW[i][4*k + 1]= 1.0;
2309 +                       sseW[i][4*k + 4]= sseW[i][4*k + 5]= 0.0;
2310 +               }
2311 +
2312 +               if(2*k == j)
2313 +               {
2314 +                       sseW[i][4*k + 0]= sseW[i][4*k + 1]= 0.0;
2315 +//                     sseW[i][4*k + 4]= -(sseW[i][4*k + 5]= -1.0);
2316 +               }
2317 +           }
2318 +       }
2319 +
2320 +       for(i=0; i<128; i++)
2321 +       {
2322 +               sseWindow[2*i+0]= -a52_imdct_window[2*i+0];
2323 +               sseWindow[2*i+1]=  a52_imdct_window[2*i+1];
2324 +       }
2325 +
2326 +       for(i=0; i<64; i++)
2327 +       {
2328 +               sseWindow[256 + 2*i+0]= -a52_imdct_window[254 - 2*i+1];
2329 +               sseWindow[256 + 2*i+1]=  a52_imdct_window[254 - 2*i+0];
2330 +               sseWindow[384 + 2*i+0]=  a52_imdct_window[126 - 2*i+1];
2331 +               sseWindow[384 + 2*i+1]= -a52_imdct_window[126 - 2*i+0];
2332 +       }
2333 +#endif
2334 +       a52_imdct_512 = imdct_do_512;
2335 +       ifft128 = ifft128_c;
2336 +       ifft64 = ifft64_c;
2337 +
2338 +#if defined(ARCH_X86) || defined(ARCH_X86_64)
2339 +       if(mm_accel & MM_ACCEL_X86_SSE)
2340 +       {
2341 +         fprintf (stderr, "Using SSE optimized IMDCT transform\n");
2342 +         a52_imdct_512 = imdct_do_512_sse;
2343 +       }
2344 +       else
2345 +       if(mm_accel & MM_ACCEL_X86_3DNOWEXT)
2346 +       {
2347 +         fprintf (stderr, "Using 3DNowEx optimized IMDCT transform\n");
2348 +         a52_imdct_512 = imdct_do_512_3dnowex;
2349 +       }
2350 +       else
2351 +       if(mm_accel & MM_ACCEL_X86_3DNOW)
2352 +       {
2353 +         fprintf (stderr, "Using 3DNow optimized IMDCT transform\n");
2354 +         a52_imdct_512 = imdct_do_512_3dnow;
2355 +       }
2356 +       else
2357 +#endif // ARCH_X86 || ARCH_X86_64
2358 +#ifdef HAVE_ALTIVEC
2359 +        if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
2360 +       {
2361 +         fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
2362 +          a52_imdct_512 = imdct_do_512_altivec;
2363 +       }
2364 +       else
2365 +#endif
2366
2367  #ifdef LIBA52_DJBFFT
2368      if (mm_accel & MM_ACCEL_DJBFFT) {
2369 @@ -426,7 +1307,5 @@
2370  #endif
2371      {
2372         fprintf (stderr, "No accelerated IMDCT transform found\n");
2373 -       ifft128 = ifft128_c;
2374 -       ifft64 = ifft64_c;
2375      }
2376  }
2377 --- include/mm_accel.h  2006-06-12 15:05:00.000000000 +0200
2378 +++ liba52/mm_accel.h   2006-06-05 02:23:04.000000000 +0200
2379 @@ -30,7 +34,12 @@
2380  /* x86 accelerations */
2381  #define MM_ACCEL_X86_MMX       0x80000000
2382  #define MM_ACCEL_X86_3DNOW     0x40000000
2383 +#define MM_ACCEL_X86_3DNOWEXT  0x08000000
2384  #define MM_ACCEL_X86_MMXEXT    0x20000000
2385 +#define MM_ACCEL_X86_SSE       0x10000000
2386 +
2387 +/* PPC accelerations */
2388 +#define MM_ACCEL_PPC_ALTIVEC   0x00010000
2389
2390  uint32_t mm_accel (void);
2391
2392 --- liba52/parse.c      2006-12-05 08:08:01.000000000 +0100
2393 +++ liba52/parse.c      2006-12-05 08:08:44.000000000 +0100
2394 @@ -28,6 +28,7 @@
2395  #include "config.h"
2396
2397  #include <stdlib.h>
2398 +#include <stdio.h>
2399  #include <string.h>
2400  #include <inttypes.h>
2401
2402 @@ -35,13 +36,12 @@
2403  #include "a52_internal.h"
2404  #include "bitstream.h"
2405  #include "tables.h"
2406 +#include "mm_accel.h"
2407 +#include "libavutil/avutil.h"
2408
2409  #ifdef HAVE_MEMALIGN
2410  /* some systems have memalign() but no declaration for it */
2411  void * memalign (size_t align, size_t size);
2412 -#else
2413 -/* assume malloc alignment is sufficient */
2414 -#define memalign(align,size) malloc (size)
2415  #endif
2416
2417  typedef struct {
2418 @@ -64,7 +64,16 @@
2419      if (state == NULL)
2420         return NULL;
2421
2422 +#if defined(__MINGW32__) && defined(HAVE_SSE)
2423 +    state->samples = av_malloc(256 * 12 * sizeof (sample_t));
2424 +#else
2425      state->samples = memalign (16, 256 * 12 * sizeof (sample_t));
2426 +#endif
2427 +    if(((uintptr_t)state->samples%16) && (mm_accel&MM_ACCEL_X86_SSE)){ /* pointer-sized cast: int is narrower than a pointer on x86_64 */
2428 +      mm_accel &=~MM_ACCEL_X86_SSE;
2429 +      fprintf(stderr, "liba52: unable to get 16 byte aligned memory disabling usage of SSE instructions\n");
2430 +    }
2431 +
2432      if (state->samples == NULL) {
2433         free (state);
2434         return NULL;
2435 @@ -78,6 +87,7 @@
2436      state->lfsr_state = 1;
2437
2438      a52_imdct_init (mm_accel);
2439 +    downmix_accel_init(mm_accel);
2440
2441      return state;
2442  }
2443 @@ -145,7 +155,7 @@
2444      state->acmod = acmod = buf[6] >> 5;
2445
2446      a52_bitstream_set_ptr (state, buf + 6);
2447 -    bitstream_get (state, 3);  /* skip acmod we already parsed */
2448 +    bitstream_skip (state, 3); /* skip acmod we already parsed */
2449
2450      if ((acmod == 2) && (bitstream_get (state, 2) == 2))       /* dsurmod */
2451         acmod = A52_DOLBY;
2452 @@ -176,28 +186,28 @@
2453
2454      chaninfo = !acmod;
2455      do {
2456 -       bitstream_get (state, 5);       /* dialnorm */
2457 +       bitstream_skip (state, 5);      /* dialnorm */
2458         if (bitstream_get (state, 1))   /* compre */
2459 -           bitstream_get (state, 8);   /* compr */
2460 +           bitstream_skip (state, 8);  /* compr */
2461         if (bitstream_get (state, 1))   /* langcode */
2462 -           bitstream_get (state, 8);   /* langcod */
2463 +           bitstream_skip (state, 8);  /* langcod */
2464         if (bitstream_get (state, 1))   /* audprodie */
2465 -           bitstream_get (state, 7);   /* mixlevel + roomtyp */
2466 +           bitstream_skip (state, 7);  /* mixlevel + roomtyp */
2467      } while (chaninfo--);
2468
2469 -    bitstream_get (state, 2);          /* copyrightb + origbs */
2470 +    bitstream_skip (state, 2);         /* copyrightb + origbs */
2471
2472      if (bitstream_get (state, 1))      /* timecod1e */
2473 -       bitstream_get (state, 14);      /* timecod1 */
2474 +       bitstream_skip (state, 14);     /* timecod1 */
2475      if (bitstream_get (state, 1))      /* timecod2e */
2476 -       bitstream_get (state, 14);      /* timecod2 */
2477 +       bitstream_skip (state, 14);     /* timecod2 */
2478
2479      if (bitstream_get (state, 1)) {    /* addbsie */
2480         int addbsil;
2481
2482         addbsil = bitstream_get (state, 6);
2483         do {
2484 -           bitstream_get (state, 8);   /* addbsi */
2485 +           bitstream_skip (state, 8);  /* addbsi */
2486         } while (addbsil--);
2487      }
2488
2489 @@ -684,7 +694,7 @@
2490                                  state->fbw_expbap[i].exp[0],
2491                                  state->fbw_expbap[i].exp + 1))
2492                 return 1;
2493 -           bitstream_get (state, 2);   /* gainrng */
2494 +           bitstream_skip (state, 2);  /* gainrng */
2495         }
2496      if (lfeexpstr != EXP_REUSE) {
2497         do_bit_alloc |= 32;
2498 @@ -759,7 +769,7 @@
2499      if (bitstream_get (state, 1)) {    /* skiple */
2500         i = bitstream_get (state, 9);   /* skipl */
2501         while (i--)
2502 -           bitstream_get (state, 8);
2503 +           bitstream_skip (state, 8);
2504      }
2505
2506      samples = state->samples;
2507 @@ -900,6 +910,10 @@
2508
2509  void a52_free (a52_state_t * state)
2510  {
2511 -    free (state->samples);
2512 +#if defined(__MINGW32__) && defined(HAVE_SSE)
2513 +    av_free (state->samples);
2514 +#else
2515 +     free (state->samples);
2516 +#endif
2517      free (state);
2518  }