/*
 * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of a52dec, a free ATSC A-52 stream decoder.
 * See http://liba52.sourceforge.net/ for updates.
 *
 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
 * detailed changelog at http://svn.mplayerhq.hu/mplayer/trunk/
 *
 * a52dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * a52dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
 */
36 #include "a52_internal.h"
/*
 * Pack an (input channel mode, output channel mode) pair into one integer
 * that can be used as a switch/case label.  The input mode occupies the
 * low three bits, so it must be in 0..7 (callers mask with "& 7" where the
 * value may carry extra flag bits).
 */
#define CONVERT(acmod,output) (((output) << 3) + (acmod))
42 void (*a52_downmix
)(sample_t
* samples
, int acmod
, int output
, sample_t bias
,
43 sample_t clev
, sample_t slev
)= NULL
;
44 void (*a52_upmix
)(sample_t
* samples
, int acmod
, int output
)= NULL
;
46 static void downmix_SSE (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
47 sample_t clev
, sample_t slev
);
48 static void downmix_3dnow (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
49 sample_t clev
, sample_t slev
);
50 static void downmix_C (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
51 sample_t clev
, sample_t slev
);
52 static void upmix_MMX (sample_t
* samples
, int acmod
, int output
);
53 static void upmix_C (sample_t
* samples
, int acmod
, int output
);
55 void downmix_accel_init(uint32_t mm_accel
)
58 a52_downmix
= downmix_C
;
59 #if ARCH_X86 || ARCH_X86_64
60 if(mm_accel
& MM_ACCEL_X86_MMX
) a52_upmix
= upmix_MMX
;
61 if(mm_accel
& MM_ACCEL_X86_SSE
) a52_downmix
= downmix_SSE
;
62 if(mm_accel
& MM_ACCEL_X86_3DNOW
) a52_downmix
= downmix_3dnow
;
66 int a52_downmix_init (int input
, int flags
, sample_t
* level
,
67 sample_t clev
, sample_t slev
)
69 static uint8_t table
[11][8] = {
70 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_STEREO
,
71 A52_STEREO
, A52_STEREO
, A52_STEREO
, A52_STEREO
},
72 {A52_MONO
, A52_MONO
, A52_MONO
, A52_MONO
,
73 A52_MONO
, A52_MONO
, A52_MONO
, A52_MONO
},
74 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_STEREO
,
75 A52_STEREO
, A52_STEREO
, A52_STEREO
, A52_STEREO
},
76 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_3F
,
77 A52_STEREO
, A52_3F
, A52_STEREO
, A52_3F
},
78 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_STEREO
,
79 A52_2F1R
, A52_2F1R
, A52_2F1R
, A52_2F1R
},
80 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_STEREO
,
81 A52_2F1R
, A52_3F1R
, A52_2F1R
, A52_3F1R
},
82 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_3F
,
83 A52_2F2R
, A52_2F2R
, A52_2F2R
, A52_2F2R
},
84 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_3F
,
85 A52_2F2R
, A52_3F2R
, A52_2F2R
, A52_3F2R
},
86 {A52_CHANNEL1
, A52_MONO
, A52_MONO
, A52_MONO
,
87 A52_MONO
, A52_MONO
, A52_MONO
, A52_MONO
},
88 {A52_CHANNEL2
, A52_MONO
, A52_MONO
, A52_MONO
,
89 A52_MONO
, A52_MONO
, A52_MONO
, A52_MONO
},
90 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_DOLBY
,
91 A52_DOLBY
, A52_DOLBY
, A52_DOLBY
, A52_DOLBY
}
95 output
= flags
& A52_CHANNEL_MASK
;
96 if (output
> A52_DOLBY
)
99 output
= table
[output
][input
& 7];
101 if ((output
== A52_STEREO
) &&
102 ((input
== A52_DOLBY
) || ((input
== A52_3F
) && (clev
== LEVEL_3DB
))))
105 if (flags
& A52_ADJUST_LEVEL
)
106 switch (CONVERT (input
& 7, output
)) {
108 case CONVERT (A52_3F
, A52_MONO
):
109 *level
*= LEVEL_3DB
/ (1 + clev
);
112 case CONVERT (A52_STEREO
, A52_MONO
):
113 case CONVERT (A52_2F2R
, A52_2F1R
):
114 case CONVERT (A52_3F2R
, A52_3F1R
):
119 case CONVERT (A52_3F2R
, A52_2F1R
):
120 if (clev
< LEVEL_PLUS3DB
- 1)
123 case CONVERT (A52_3F
, A52_STEREO
):
124 case CONVERT (A52_3F1R
, A52_2F1R
):
125 case CONVERT (A52_3F1R
, A52_2F2R
):
126 case CONVERT (A52_3F2R
, A52_2F2R
):
130 case CONVERT (A52_2F1R
, A52_MONO
):
131 *level
*= LEVEL_PLUS3DB
/ (2 + slev
);
134 case CONVERT (A52_2F1R
, A52_STEREO
):
135 case CONVERT (A52_3F1R
, A52_3F
):
136 *level
/= 1 + slev
* LEVEL_3DB
;
139 case CONVERT (A52_3F1R
, A52_MONO
):
140 *level
*= LEVEL_3DB
/ (1 + clev
+ 0.5 * slev
);
143 case CONVERT (A52_3F1R
, A52_STEREO
):
144 *level
/= 1 + clev
+ slev
* LEVEL_3DB
;
147 case CONVERT (A52_2F2R
, A52_MONO
):
148 *level
*= LEVEL_3DB
/ (1 + slev
);
151 case CONVERT (A52_2F2R
, A52_STEREO
):
152 case CONVERT (A52_3F2R
, A52_3F
):
156 case CONVERT (A52_3F2R
, A52_MONO
):
157 *level
*= LEVEL_3DB
/ (1 + clev
+ slev
);
160 case CONVERT (A52_3F2R
, A52_STEREO
):
161 *level
/= 1 + clev
+ slev
;
164 case CONVERT (A52_MONO
, A52_DOLBY
):
165 *level
*= LEVEL_PLUS3DB
;
168 case CONVERT (A52_3F
, A52_DOLBY
):
169 case CONVERT (A52_2F1R
, A52_DOLBY
):
170 *level
*= 1 / (1 + LEVEL_3DB
);
173 case CONVERT (A52_3F1R
, A52_DOLBY
):
174 case CONVERT (A52_2F2R
, A52_DOLBY
):
175 *level
*= 1 / (1 + 2 * LEVEL_3DB
);
178 case CONVERT (A52_3F2R
, A52_DOLBY
):
179 *level
*= 1 / (1 + 3 * LEVEL_3DB
);
186 int a52_downmix_coeff (sample_t
* coeff
, int acmod
, int output
, sample_t level
,
187 sample_t clev
, sample_t slev
)
189 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
191 case CONVERT (A52_CHANNEL
, A52_CHANNEL
):
192 case CONVERT (A52_MONO
, A52_MONO
):
193 case CONVERT (A52_STEREO
, A52_STEREO
):
194 case CONVERT (A52_3F
, A52_3F
):
195 case CONVERT (A52_2F1R
, A52_2F1R
):
196 case CONVERT (A52_3F1R
, A52_3F1R
):
197 case CONVERT (A52_2F2R
, A52_2F2R
):
198 case CONVERT (A52_3F2R
, A52_3F2R
):
199 case CONVERT (A52_STEREO
, A52_DOLBY
):
200 coeff
[0] = coeff
[1] = coeff
[2] = coeff
[3] = coeff
[4] = level
;
203 case CONVERT (A52_CHANNEL
, A52_MONO
):
204 coeff
[0] = coeff
[1] = level
* LEVEL_6DB
;
207 case CONVERT (A52_STEREO
, A52_MONO
):
208 coeff
[0] = coeff
[1] = level
* LEVEL_3DB
;
211 case CONVERT (A52_3F
, A52_MONO
):
212 coeff
[0] = coeff
[2] = level
* LEVEL_3DB
;
213 coeff
[1] = level
* clev
* LEVEL_PLUS3DB
;
216 case CONVERT (A52_2F1R
, A52_MONO
):
217 coeff
[0] = coeff
[1] = level
* LEVEL_3DB
;
218 coeff
[2] = level
* slev
* LEVEL_3DB
;
221 case CONVERT (A52_2F2R
, A52_MONO
):
222 coeff
[0] = coeff
[1] = level
* LEVEL_3DB
;
223 coeff
[2] = coeff
[3] = level
* slev
* LEVEL_3DB
;
226 case CONVERT (A52_3F1R
, A52_MONO
):
227 coeff
[0] = coeff
[2] = level
* LEVEL_3DB
;
228 coeff
[1] = level
* clev
* LEVEL_PLUS3DB
;
229 coeff
[3] = level
* slev
* LEVEL_3DB
;
232 case CONVERT (A52_3F2R
, A52_MONO
):
233 coeff
[0] = coeff
[2] = level
* LEVEL_3DB
;
234 coeff
[1] = level
* clev
* LEVEL_PLUS3DB
;
235 coeff
[3] = coeff
[4] = level
* slev
* LEVEL_3DB
;
238 case CONVERT (A52_MONO
, A52_DOLBY
):
239 coeff
[0] = level
* LEVEL_3DB
;
242 case CONVERT (A52_3F
, A52_DOLBY
):
244 case CONVERT (A52_3F
, A52_STEREO
):
245 case CONVERT (A52_3F1R
, A52_2F1R
):
246 case CONVERT (A52_3F2R
, A52_2F2R
):
247 coeff
[0] = coeff
[2] = coeff
[3] = coeff
[4] = level
;
248 coeff
[1] = level
* clev
;
251 case CONVERT (A52_2F1R
, A52_DOLBY
):
253 case CONVERT (A52_2F1R
, A52_STEREO
):
254 coeff
[0] = coeff
[1] = level
;
255 coeff
[2] = level
* slev
* LEVEL_3DB
;
258 case CONVERT (A52_3F1R
, A52_DOLBY
):
261 case CONVERT (A52_3F1R
, A52_STEREO
):
262 coeff
[0] = coeff
[2] = level
;
263 coeff
[1] = level
* clev
;
264 coeff
[3] = level
* slev
* LEVEL_3DB
;
267 case CONVERT (A52_2F2R
, A52_DOLBY
):
269 case CONVERT (A52_2F2R
, A52_STEREO
):
270 coeff
[0] = coeff
[1] = level
;
271 coeff
[2] = coeff
[3] = level
* slev
;
274 case CONVERT (A52_3F2R
, A52_DOLBY
):
276 case CONVERT (A52_3F2R
, A52_2F1R
):
278 case CONVERT (A52_3F2R
, A52_STEREO
):
279 coeff
[0] = coeff
[2] = level
;
280 coeff
[1] = level
* clev
;
281 coeff
[3] = coeff
[4] = level
* slev
;
284 case CONVERT (A52_3F1R
, A52_3F
):
285 coeff
[0] = coeff
[1] = coeff
[2] = level
;
286 coeff
[3] = level
* slev
* LEVEL_3DB
;
289 case CONVERT (A52_3F2R
, A52_3F
):
290 coeff
[0] = coeff
[1] = coeff
[2] = level
;
291 coeff
[3] = coeff
[4] = level
* slev
;
294 case CONVERT (A52_2F2R
, A52_2F1R
):
295 coeff
[0] = coeff
[1] = level
;
296 coeff
[2] = coeff
[3] = level
* LEVEL_3DB
;
299 case CONVERT (A52_3F2R
, A52_3F1R
):
300 coeff
[0] = coeff
[1] = coeff
[2] = level
;
301 coeff
[3] = coeff
[4] = level
* LEVEL_3DB
;
304 case CONVERT (A52_2F1R
, A52_2F2R
):
305 coeff
[0] = coeff
[1] = level
;
306 coeff
[2] = level
* LEVEL_3DB
;
309 case CONVERT (A52_3F1R
, A52_2F2R
):
310 coeff
[0] = coeff
[2] = level
;
311 coeff
[1] = level
* clev
;
312 coeff
[3] = level
* LEVEL_3DB
;
315 case CONVERT (A52_3F1R
, A52_3F2R
):
316 coeff
[0] = coeff
[1] = coeff
[2] = level
;
317 coeff
[3] = level
* LEVEL_3DB
;
320 case CONVERT (A52_CHANNEL
, A52_CHANNEL1
):
325 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
331 return -1; /* NOTREACHED */
334 static void mix2to1 (sample_t
* dest
, sample_t
* src
, sample_t bias
)
338 for (i
= 0; i
< 256; i
++)
339 dest
[i
] += src
[i
] + bias
;
342 static void mix3to1 (sample_t
* samples
, sample_t bias
)
346 for (i
= 0; i
< 256; i
++)
347 samples
[i
] += samples
[i
+ 256] + samples
[i
+ 512] + bias
;
350 static void mix4to1 (sample_t
* samples
, sample_t bias
)
354 for (i
= 0; i
< 256; i
++)
355 samples
[i
] += (samples
[i
+ 256] + samples
[i
+ 512] +
356 samples
[i
+ 768] + bias
);
359 static void mix5to1 (sample_t
* samples
, sample_t bias
)
363 for (i
= 0; i
< 256; i
++)
364 samples
[i
] += (samples
[i
+ 256] + samples
[i
+ 512] +
365 samples
[i
+ 768] + samples
[i
+ 1024] + bias
);
368 static void mix3to2 (sample_t
* samples
, sample_t bias
)
373 for (i
= 0; i
< 256; i
++) {
374 common
= samples
[i
+ 256] + bias
;
375 samples
[i
] += common
;
376 samples
[i
+ 256] = samples
[i
+ 512] + common
;
380 static void mix21to2 (sample_t
* left
, sample_t
* right
, sample_t bias
)
385 for (i
= 0; i
< 256; i
++) {
386 common
= right
[i
+ 256] + bias
;
392 static void mix21toS (sample_t
* samples
, sample_t bias
)
397 for (i
= 0; i
< 256; i
++) {
398 surround
= samples
[i
+ 512];
399 samples
[i
] += bias
- surround
;
400 samples
[i
+ 256] += bias
+ surround
;
404 static void mix31to2 (sample_t
* samples
, sample_t bias
)
409 for (i
= 0; i
< 256; i
++) {
410 common
= samples
[i
+ 256] + samples
[i
+ 768] + bias
;
411 samples
[i
] += common
;
412 samples
[i
+ 256] = samples
[i
+ 512] + common
;
416 static void mix31toS (sample_t
* samples
, sample_t bias
)
419 sample_t common
, surround
;
421 for (i
= 0; i
< 256; i
++) {
422 common
= samples
[i
+ 256] + bias
;
423 surround
= samples
[i
+ 768];
424 samples
[i
] += common
- surround
;
425 samples
[i
+ 256] = samples
[i
+ 512] + common
+ surround
;
429 static void mix22toS (sample_t
* samples
, sample_t bias
)
434 for (i
= 0; i
< 256; i
++) {
435 surround
= samples
[i
+ 512] + samples
[i
+ 768];
436 samples
[i
] += bias
- surround
;
437 samples
[i
+ 256] += bias
+ surround
;
441 static void mix32to2 (sample_t
* samples
, sample_t bias
)
446 for (i
= 0; i
< 256; i
++) {
447 common
= samples
[i
+ 256] + bias
;
448 samples
[i
] += common
+ samples
[i
+ 768];
449 samples
[i
+ 256] = common
+ samples
[i
+ 512] + samples
[i
+ 1024];
453 static void mix32toS (sample_t
* samples
, sample_t bias
)
456 sample_t common
, surround
;
458 for (i
= 0; i
< 256; i
++) {
459 common
= samples
[i
+ 256] + bias
;
460 surround
= samples
[i
+ 768] + samples
[i
+ 1024];
461 samples
[i
] += common
- surround
;
462 samples
[i
+ 256] = samples
[i
+ 512] + common
+ surround
;
466 static void move2to1 (sample_t
* src
, sample_t
* dest
, sample_t bias
)
470 for (i
= 0; i
< 256; i
++)
471 dest
[i
] = src
[i
] + src
[i
+ 256] + bias
;
474 static void zero (sample_t
* samples
)
478 for (i
= 0; i
< 256; i
++)
482 void downmix_C (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
483 sample_t clev
, sample_t slev
)
485 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
487 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
488 memcpy (samples
, samples
+ 256, 256 * sizeof (sample_t
));
491 case CONVERT (A52_CHANNEL
, A52_MONO
):
492 case CONVERT (A52_STEREO
, A52_MONO
):
494 mix2to1 (samples
, samples
+ 256, bias
);
497 case CONVERT (A52_2F1R
, A52_MONO
):
500 case CONVERT (A52_3F
, A52_MONO
):
502 mix3to1 (samples
, bias
);
505 case CONVERT (A52_3F1R
, A52_MONO
):
508 case CONVERT (A52_2F2R
, A52_MONO
):
511 mix4to1 (samples
, bias
);
514 case CONVERT (A52_3F2R
, A52_MONO
):
517 mix5to1 (samples
, bias
);
520 case CONVERT (A52_MONO
, A52_DOLBY
):
521 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
524 case CONVERT (A52_3F
, A52_STEREO
):
525 case CONVERT (A52_3F
, A52_DOLBY
):
527 mix3to2 (samples
, bias
);
530 case CONVERT (A52_2F1R
, A52_STEREO
):
533 mix21to2 (samples
, samples
+ 256, bias
);
536 case CONVERT (A52_2F1R
, A52_DOLBY
):
537 mix21toS (samples
, bias
);
540 case CONVERT (A52_3F1R
, A52_STEREO
):
543 mix31to2 (samples
, bias
);
546 case CONVERT (A52_3F1R
, A52_DOLBY
):
547 mix31toS (samples
, bias
);
550 case CONVERT (A52_2F2R
, A52_STEREO
):
553 mix2to1 (samples
, samples
+ 512, bias
);
554 mix2to1 (samples
+ 256, samples
+ 768, bias
);
557 case CONVERT (A52_2F2R
, A52_DOLBY
):
558 mix22toS (samples
, bias
);
561 case CONVERT (A52_3F2R
, A52_STEREO
):
564 mix32to2 (samples
, bias
);
567 case CONVERT (A52_3F2R
, A52_DOLBY
):
568 mix32toS (samples
, bias
);
571 case CONVERT (A52_3F1R
, A52_3F
):
574 mix21to2 (samples
, samples
+ 512, bias
);
577 case CONVERT (A52_3F2R
, A52_3F
):
580 mix2to1 (samples
, samples
+ 768, bias
);
581 mix2to1 (samples
+ 512, samples
+ 1024, bias
);
584 case CONVERT (A52_3F1R
, A52_2F1R
):
585 mix3to2 (samples
, bias
);
586 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
589 case CONVERT (A52_2F2R
, A52_2F1R
):
590 mix2to1 (samples
+ 512, samples
+ 768, bias
);
593 case CONVERT (A52_3F2R
, A52_2F1R
):
594 mix3to2 (samples
, bias
); //FIXME possible bug? (output doesnt seem to be used)
595 move2to1 (samples
+ 768, samples
+ 512, bias
);
598 case CONVERT (A52_3F2R
, A52_3F1R
):
599 mix2to1 (samples
+ 768, samples
+ 1024, bias
);
602 case CONVERT (A52_2F1R
, A52_2F2R
):
603 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
606 case CONVERT (A52_3F1R
, A52_2F2R
):
607 mix3to2 (samples
, bias
);
608 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
611 case CONVERT (A52_3F2R
, A52_2F2R
):
612 mix3to2 (samples
, bias
);
613 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
614 memcpy (samples
+ 768, samples
+ 1024, 256 * sizeof (sample_t
));
617 case CONVERT (A52_3F1R
, A52_3F2R
):
618 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
623 void upmix_C (sample_t
* samples
, int acmod
, int output
)
625 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
627 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
628 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
631 case CONVERT (A52_3F2R
, A52_MONO
):
632 zero (samples
+ 1024);
633 case CONVERT (A52_3F1R
, A52_MONO
):
634 case CONVERT (A52_2F2R
, A52_MONO
):
635 zero (samples
+ 768);
636 case CONVERT (A52_3F
, A52_MONO
):
637 case CONVERT (A52_2F1R
, A52_MONO
):
638 zero (samples
+ 512);
639 case CONVERT (A52_CHANNEL
, A52_MONO
):
640 case CONVERT (A52_STEREO
, A52_MONO
):
641 zero (samples
+ 256);
644 case CONVERT (A52_3F2R
, A52_STEREO
):
645 case CONVERT (A52_3F2R
, A52_DOLBY
):
646 zero (samples
+ 1024);
647 case CONVERT (A52_3F1R
, A52_STEREO
):
648 case CONVERT (A52_3F1R
, A52_DOLBY
):
649 zero (samples
+ 768);
650 case CONVERT (A52_3F
, A52_STEREO
):
651 case CONVERT (A52_3F
, A52_DOLBY
):
653 memcpy (samples
+ 512, samples
+ 256, 256 * sizeof (sample_t
));
654 zero (samples
+ 256);
657 case CONVERT (A52_2F2R
, A52_STEREO
):
658 case CONVERT (A52_2F2R
, A52_DOLBY
):
659 zero (samples
+ 768);
660 case CONVERT (A52_2F1R
, A52_STEREO
):
661 case CONVERT (A52_2F1R
, A52_DOLBY
):
662 zero (samples
+ 512);
665 case CONVERT (A52_3F2R
, A52_3F
):
666 zero (samples
+ 1024);
667 case CONVERT (A52_3F1R
, A52_3F
):
668 case CONVERT (A52_2F2R
, A52_2F1R
):
669 zero (samples
+ 768);
672 case CONVERT (A52_3F2R
, A52_3F1R
):
673 zero (samples
+ 1024);
676 case CONVERT (A52_3F2R
, A52_2F1R
):
677 zero (samples
+ 1024);
678 case CONVERT (A52_3F1R
, A52_2F1R
):
680 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
683 case CONVERT (A52_3F2R
, A52_2F2R
):
684 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
689 #if ARCH_X86 || ARCH_X86_64
690 static void mix2to1_SSE (sample_t
* dest
, sample_t
* src
, sample_t bias
)
693 "movlps %2, %%xmm7 \n\t"
694 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
695 "mov $-1024, %%"REG_S
" \n\t"
698 "movaps (%0, %%"REG_S
"), %%xmm0 \n\t"
699 "movaps 16(%0, %%"REG_S
"), %%xmm1\n\t"
700 "addps (%1, %%"REG_S
"), %%xmm0 \n\t"
701 "addps 16(%1, %%"REG_S
"), %%xmm1\n\t"
702 "addps %%xmm7, %%xmm0 \n\t"
703 "addps %%xmm7, %%xmm1 \n\t"
704 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
705 "movaps %%xmm1, 16(%1, %%"REG_S
")\n\t"
706 "add $32, %%"REG_S
" \n\t"
708 :: "r" (src
+256), "r" (dest
+256), "m" (bias
)
713 static void mix3to1_SSE (sample_t
* samples
, sample_t bias
)
716 "movlps %1, %%xmm7 \n\t"
717 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
718 "mov $-1024, %%"REG_S
" \n\t"
721 "movaps (%0, %%"REG_S
"), %%xmm0 \n\t"
722 "movaps 1024(%0, %%"REG_S
"), %%xmm1\n\t"
723 "addps 2048(%0, %%"REG_S
"), %%xmm0\n\t"
724 "addps %%xmm7, %%xmm1 \n\t"
725 "addps %%xmm1, %%xmm0 \n\t"
726 "movaps %%xmm0, (%0, %%"REG_S
") \n\t"
727 "add $16, %%"REG_S
" \n\t"
729 :: "r" (samples
+256), "m" (bias
)
734 static void mix4to1_SSE (sample_t
* samples
, sample_t bias
)
737 "movlps %1, %%xmm7 \n\t"
738 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
739 "mov $-1024, %%"REG_S
" \n\t"
742 "movaps (%0, %%"REG_S
"), %%xmm0 \n\t"
743 "movaps 1024(%0, %%"REG_S
"), %%xmm1\n\t"
744 "addps 2048(%0, %%"REG_S
"), %%xmm0\n\t"
745 "addps 3072(%0, %%"REG_S
"), %%xmm1\n\t"
746 "addps %%xmm7, %%xmm0 \n\t"
747 "addps %%xmm1, %%xmm0 \n\t"
748 "movaps %%xmm0, (%0, %%"REG_S
") \n\t"
749 "add $16, %%"REG_S
" \n\t"
751 :: "r" (samples
+256), "m" (bias
)
756 static void mix5to1_SSE (sample_t
* samples
, sample_t bias
)
759 "movlps %1, %%xmm7 \n\t"
760 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
761 "mov $-1024, %%"REG_S
" \n\t"
764 "movaps (%0, %%"REG_S
"), %%xmm0 \n\t"
765 "movaps 1024(%0, %%"REG_S
"), %%xmm1\n\t"
766 "addps 2048(%0, %%"REG_S
"), %%xmm0\n\t"
767 "addps 3072(%0, %%"REG_S
"), %%xmm1\n\t"
768 "addps %%xmm7, %%xmm0 \n\t"
769 "addps 4096(%0, %%"REG_S
"), %%xmm1\n\t"
770 "addps %%xmm1, %%xmm0 \n\t"
771 "movaps %%xmm0, (%0, %%"REG_S
") \n\t"
772 "add $16, %%"REG_S
" \n\t"
774 :: "r" (samples
+256), "m" (bias
)
779 static void mix3to2_SSE (sample_t
* samples
, sample_t bias
)
782 "movlps %1, %%xmm7 \n\t"
783 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
784 "mov $-1024, %%"REG_S
" \n\t"
787 "movaps 1024(%0, %%"REG_S
"), %%xmm0\n\t"
788 "addps %%xmm7, %%xmm0 \n\t" //common
789 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
790 "movaps 2048(%0, %%"REG_S
"), %%xmm2\n\t"
791 "addps %%xmm0, %%xmm1 \n\t"
792 "addps %%xmm0, %%xmm2 \n\t"
793 "movaps %%xmm1, (%0, %%"REG_S
") \n\t"
794 "movaps %%xmm2, 1024(%0, %%"REG_S
")\n\t"
795 "add $16, %%"REG_S
" \n\t"
797 :: "r" (samples
+256), "m" (bias
)
802 static void mix21to2_SSE (sample_t
* left
, sample_t
* right
, sample_t bias
)
805 "movlps %2, %%xmm7 \n\t"
806 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
807 "mov $-1024, %%"REG_S
" \n\t"
810 "movaps 1024(%1, %%"REG_S
"), %%xmm0\n\t"
811 "addps %%xmm7, %%xmm0 \n\t" //common
812 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
813 "movaps (%1, %%"REG_S
"), %%xmm2 \n\t"
814 "addps %%xmm0, %%xmm1 \n\t"
815 "addps %%xmm0, %%xmm2 \n\t"
816 "movaps %%xmm1, (%0, %%"REG_S
") \n\t"
817 "movaps %%xmm2, (%1, %%"REG_S
") \n\t"
818 "add $16, %%"REG_S
" \n\t"
820 :: "r" (left
+256), "r" (right
+256), "m" (bias
)
825 static void mix21toS_SSE (sample_t
* samples
, sample_t bias
)
828 "movlps %1, %%xmm7 \n\t"
829 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
830 "mov $-1024, %%"REG_S
" \n\t"
833 "movaps 2048(%0, %%"REG_S
"), %%xmm0\n\t" // surround
834 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
835 "movaps 1024(%0, %%"REG_S
"), %%xmm2\n\t"
836 "addps %%xmm7, %%xmm1 \n\t"
837 "addps %%xmm7, %%xmm2 \n\t"
838 "subps %%xmm0, %%xmm1 \n\t"
839 "addps %%xmm0, %%xmm2 \n\t"
840 "movaps %%xmm1, (%0, %%"REG_S
") \n\t"
841 "movaps %%xmm2, 1024(%0, %%"REG_S
")\n\t"
842 "add $16, %%"REG_S
" \n\t"
844 :: "r" (samples
+256), "m" (bias
)
849 static void mix31to2_SSE (sample_t
* samples
, sample_t bias
)
852 "movlps %1, %%xmm7 \n\t"
853 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
854 "mov $-1024, %%"REG_S
" \n\t"
857 "movaps 1024(%0, %%"REG_S
"), %%xmm0\n\t"
858 "addps 3072(%0, %%"REG_S
"), %%xmm0\n\t"
859 "addps %%xmm7, %%xmm0 \n\t" // common
860 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
861 "movaps 2048(%0, %%"REG_S
"), %%xmm2\n\t"
862 "addps %%xmm0, %%xmm1 \n\t"
863 "addps %%xmm0, %%xmm2 \n\t"
864 "movaps %%xmm1, (%0, %%"REG_S
") \n\t"
865 "movaps %%xmm2, 1024(%0, %%"REG_S
")\n\t"
866 "add $16, %%"REG_S
" \n\t"
868 :: "r" (samples
+256), "m" (bias
)
873 static void mix31toS_SSE (sample_t
* samples
, sample_t bias
)
876 "movlps %1, %%xmm7 \n\t"
877 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
878 "mov $-1024, %%"REG_S
" \n\t"
881 "movaps 1024(%0, %%"REG_S
"), %%xmm0\n\t"
882 "movaps 3072(%0, %%"REG_S
"), %%xmm3\n\t" // surround
883 "addps %%xmm7, %%xmm0 \n\t" // common
884 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
885 "movaps 2048(%0, %%"REG_S
"), %%xmm2\n\t"
886 "addps %%xmm0, %%xmm1 \n\t"
887 "addps %%xmm0, %%xmm2 \n\t"
888 "subps %%xmm3, %%xmm1 \n\t"
889 "addps %%xmm3, %%xmm2 \n\t"
890 "movaps %%xmm1, (%0, %%"REG_S
") \n\t"
891 "movaps %%xmm2, 1024(%0, %%"REG_S
")\n\t"
892 "add $16, %%"REG_S
" \n\t"
894 :: "r" (samples
+256), "m" (bias
)
899 static void mix22toS_SSE (sample_t
* samples
, sample_t bias
)
902 "movlps %1, %%xmm7 \n\t"
903 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
904 "mov $-1024, %%"REG_S
" \n\t"
907 "movaps 2048(%0, %%"REG_S
"), %%xmm0\n\t"
908 "addps 3072(%0, %%"REG_S
"), %%xmm0\n\t" // surround
909 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
910 "movaps 1024(%0, %%"REG_S
"), %%xmm2\n\t"
911 "addps %%xmm7, %%xmm1 \n\t"
912 "addps %%xmm7, %%xmm2 \n\t"
913 "subps %%xmm0, %%xmm1 \n\t"
914 "addps %%xmm0, %%xmm2 \n\t"
915 "movaps %%xmm1, (%0, %%"REG_S
") \n\t"
916 "movaps %%xmm2, 1024(%0, %%"REG_S
")\n\t"
917 "add $16, %%"REG_S
" \n\t"
919 :: "r" (samples
+256), "m" (bias
)
924 static void mix32to2_SSE (sample_t
* samples
, sample_t bias
)
927 "movlps %1, %%xmm7 \n\t"
928 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
929 "mov $-1024, %%"REG_S
" \n\t"
932 "movaps 1024(%0, %%"REG_S
"), %%xmm0\n\t"
933 "addps %%xmm7, %%xmm0 \n\t" // common
934 "movaps %%xmm0, %%xmm1 \n\t" // common
935 "addps (%0, %%"REG_S
"), %%xmm0 \n\t"
936 "addps 2048(%0, %%"REG_S
"), %%xmm1\n\t"
937 "addps 3072(%0, %%"REG_S
"), %%xmm0\n\t"
938 "addps 4096(%0, %%"REG_S
"), %%xmm1\n\t"
939 "movaps %%xmm0, (%0, %%"REG_S
") \n\t"
940 "movaps %%xmm1, 1024(%0, %%"REG_S
")\n\t"
941 "add $16, %%"REG_S
" \n\t"
943 :: "r" (samples
+256), "m" (bias
)
948 static void mix32toS_SSE (sample_t
* samples
, sample_t bias
)
951 "movlps %1, %%xmm7 \n\t"
952 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
953 "mov $-1024, %%"REG_S
" \n\t"
956 "movaps 1024(%0, %%"REG_S
"), %%xmm0\n\t"
957 "movaps 3072(%0, %%"REG_S
"), %%xmm2\n\t"
958 "addps %%xmm7, %%xmm0 \n\t" // common
959 "addps 4096(%0, %%"REG_S
"), %%xmm2\n\t" // surround
960 "movaps (%0, %%"REG_S
"), %%xmm1 \n\t"
961 "movaps 2048(%0, %%"REG_S
"), %%xmm3\n\t"
962 "subps %%xmm2, %%xmm1 \n\t"
963 "addps %%xmm2, %%xmm3 \n\t"
964 "addps %%xmm0, %%xmm1 \n\t"
965 "addps %%xmm0, %%xmm3 \n\t"
966 "movaps %%xmm1, (%0, %%"REG_S
") \n\t"
967 "movaps %%xmm3, 1024(%0, %%"REG_S
")\n\t"
968 "add $16, %%"REG_S
" \n\t"
970 :: "r" (samples
+256), "m" (bias
)
975 static void move2to1_SSE (sample_t
* src
, sample_t
* dest
, sample_t bias
)
978 "movlps %2, %%xmm7 \n\t"
979 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
980 "mov $-1024, %%"REG_S
" \n\t"
983 "movaps (%0, %%"REG_S
"), %%xmm0 \n\t"
984 "movaps 16(%0, %%"REG_S
"), %%xmm1\n\t"
985 "addps 1024(%0, %%"REG_S
"), %%xmm0\n\t"
986 "addps 1040(%0, %%"REG_S
"), %%xmm1\n\t"
987 "addps %%xmm7, %%xmm0 \n\t"
988 "addps %%xmm7, %%xmm1 \n\t"
989 "movaps %%xmm0, (%1, %%"REG_S
") \n\t"
990 "movaps %%xmm1, 16(%1, %%"REG_S
")\n\t"
991 "add $32, %%"REG_S
" \n\t"
993 :: "r" (src
+256), "r" (dest
+256), "m" (bias
)
998 static void zero_MMX(sample_t
* samples
)
1001 "mov $-1024, %%"REG_S
" \n\t"
1002 "pxor %%mm0, %%mm0 \n\t"
1005 "movq %%mm0, (%0, %%"REG_S
") \n\t"
1006 "movq %%mm0, 8(%0, %%"REG_S
") \n\t"
1007 "movq %%mm0, 16(%0, %%"REG_S
") \n\t"
1008 "movq %%mm0, 24(%0, %%"REG_S
") \n\t"
1009 "add $32, %%"REG_S
" \n\t"
1012 :: "r" (samples
+256)
1017 static void downmix_SSE (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
1018 sample_t clev
, sample_t slev
)
1020 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
1022 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
1023 memcpy (samples
, samples
+ 256, 256 * sizeof (sample_t
));
1026 case CONVERT (A52_CHANNEL
, A52_MONO
):
1027 case CONVERT (A52_STEREO
, A52_MONO
):
1029 mix2to1_SSE (samples
, samples
+ 256, bias
);
1032 case CONVERT (A52_2F1R
, A52_MONO
):
1035 case CONVERT (A52_3F
, A52_MONO
):
1037 mix3to1_SSE (samples
, bias
);
1040 case CONVERT (A52_3F1R
, A52_MONO
):
1043 case CONVERT (A52_2F2R
, A52_MONO
):
1046 mix4to1_SSE (samples
, bias
);
1049 case CONVERT (A52_3F2R
, A52_MONO
):
1052 mix5to1_SSE (samples
, bias
);
1055 case CONVERT (A52_MONO
, A52_DOLBY
):
1056 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
1059 case CONVERT (A52_3F
, A52_STEREO
):
1060 case CONVERT (A52_3F
, A52_DOLBY
):
1062 mix3to2_SSE (samples
, bias
);
1065 case CONVERT (A52_2F1R
, A52_STEREO
):
1068 mix21to2_SSE (samples
, samples
+ 256, bias
);
1071 case CONVERT (A52_2F1R
, A52_DOLBY
):
1072 mix21toS_SSE (samples
, bias
);
1075 case CONVERT (A52_3F1R
, A52_STEREO
):
1078 mix31to2_SSE (samples
, bias
);
1081 case CONVERT (A52_3F1R
, A52_DOLBY
):
1082 mix31toS_SSE (samples
, bias
);
1085 case CONVERT (A52_2F2R
, A52_STEREO
):
1088 mix2to1_SSE (samples
, samples
+ 512, bias
);
1089 mix2to1_SSE (samples
+ 256, samples
+ 768, bias
);
1092 case CONVERT (A52_2F2R
, A52_DOLBY
):
1093 mix22toS_SSE (samples
, bias
);
1096 case CONVERT (A52_3F2R
, A52_STEREO
):
1099 mix32to2_SSE (samples
, bias
);
1102 case CONVERT (A52_3F2R
, A52_DOLBY
):
1103 mix32toS_SSE (samples
, bias
);
1106 case CONVERT (A52_3F1R
, A52_3F
):
1109 mix21to2_SSE (samples
, samples
+ 512, bias
);
1112 case CONVERT (A52_3F2R
, A52_3F
):
1115 mix2to1_SSE (samples
, samples
+ 768, bias
);
1116 mix2to1_SSE (samples
+ 512, samples
+ 1024, bias
);
1119 case CONVERT (A52_3F1R
, A52_2F1R
):
1120 mix3to2_SSE (samples
, bias
);
1121 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1124 case CONVERT (A52_2F2R
, A52_2F1R
):
1125 mix2to1_SSE (samples
+ 512, samples
+ 768, bias
);
1128 case CONVERT (A52_3F2R
, A52_2F1R
):
1129 mix3to2_SSE (samples
, bias
); //FIXME possible bug? (output doesnt seem to be used)
1130 move2to1_SSE (samples
+ 768, samples
+ 512, bias
);
1133 case CONVERT (A52_3F2R
, A52_3F1R
):
1134 mix2to1_SSE (samples
+ 768, samples
+ 1024, bias
);
1137 case CONVERT (A52_2F1R
, A52_2F2R
):
1138 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
1141 case CONVERT (A52_3F1R
, A52_2F2R
):
1142 mix3to2_SSE (samples
, bias
);
1143 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1146 case CONVERT (A52_3F2R
, A52_2F2R
):
1147 mix3to2_SSE (samples
, bias
);
1148 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1149 memcpy (samples
+ 768, samples
+ 1024, 256 * sizeof (sample_t
));
1152 case CONVERT (A52_3F1R
, A52_3F2R
):
1153 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
1158 static void upmix_MMX (sample_t
* samples
, int acmod
, int output
)
1160 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
1162 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
1163 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
1166 case CONVERT (A52_3F2R
, A52_MONO
):
1167 zero_MMX (samples
+ 1024);
1168 case CONVERT (A52_3F1R
, A52_MONO
):
1169 case CONVERT (A52_2F2R
, A52_MONO
):
1170 zero_MMX (samples
+ 768);
1171 case CONVERT (A52_3F
, A52_MONO
):
1172 case CONVERT (A52_2F1R
, A52_MONO
):
1173 zero_MMX (samples
+ 512);
1174 case CONVERT (A52_CHANNEL
, A52_MONO
):
1175 case CONVERT (A52_STEREO
, A52_MONO
):
1176 zero_MMX (samples
+ 256);
1179 case CONVERT (A52_3F2R
, A52_STEREO
):
1180 case CONVERT (A52_3F2R
, A52_DOLBY
):
1181 zero_MMX (samples
+ 1024);
1182 case CONVERT (A52_3F1R
, A52_STEREO
):
1183 case CONVERT (A52_3F1R
, A52_DOLBY
):
1184 zero_MMX (samples
+ 768);
1185 case CONVERT (A52_3F
, A52_STEREO
):
1186 case CONVERT (A52_3F
, A52_DOLBY
):
1188 memcpy (samples
+ 512, samples
+ 256, 256 * sizeof (sample_t
));
1189 zero_MMX (samples
+ 256);
1192 case CONVERT (A52_2F2R
, A52_STEREO
):
1193 case CONVERT (A52_2F2R
, A52_DOLBY
):
1194 zero_MMX (samples
+ 768);
1195 case CONVERT (A52_2F1R
, A52_STEREO
):
1196 case CONVERT (A52_2F1R
, A52_DOLBY
):
1197 zero_MMX (samples
+ 512);
1200 case CONVERT (A52_3F2R
, A52_3F
):
1201 zero_MMX (samples
+ 1024);
1202 case CONVERT (A52_3F1R
, A52_3F
):
1203 case CONVERT (A52_2F2R
, A52_2F1R
):
1204 zero_MMX (samples
+ 768);
1207 case CONVERT (A52_3F2R
, A52_3F1R
):
1208 zero_MMX (samples
+ 1024);
1211 case CONVERT (A52_3F2R
, A52_2F1R
):
1212 zero_MMX (samples
+ 1024);
1213 case CONVERT (A52_3F1R
, A52_2F1R
):
1215 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
1218 case CONVERT (A52_3F2R
, A52_2F2R
):
1219 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
1220 goto mix_31to21_MMX
;
1224 static void mix2to1_3dnow (sample_t
* dest
, sample_t
* src
, sample_t bias
)
1227 "movd %2, %%mm7 \n\t"
1228 "punpckldq %2, %%mm7 \n\t"
1229 "mov $-1024, %%"REG_S
" \n\t"
1232 "movq (%0, %%"REG_S
"), %%mm0 \n\t"
1233 "movq 8(%0, %%"REG_S
"), %%mm1 \n\t"
1234 "movq 16(%0, %%"REG_S
"), %%mm2 \n\t"
1235 "movq 24(%0, %%"REG_S
"), %%mm3 \n\t"
1236 "pfadd (%1, %%"REG_S
"), %%mm0 \n\t"
1237 "pfadd 8(%1, %%"REG_S
"), %%mm1 \n\t"
1238 "pfadd 16(%1, %%"REG_S
"), %%mm2 \n\t"
1239 "pfadd 24(%1, %%"REG_S
"), %%mm3 \n\t"
1240 "pfadd %%mm7, %%mm0 \n\t"
1241 "pfadd %%mm7, %%mm1 \n\t"
1242 "pfadd %%mm7, %%mm2 \n\t"
1243 "pfadd %%mm7, %%mm3 \n\t"
1244 "movq %%mm0, (%1, %%"REG_S
") \n\t"
1245 "movq %%mm1, 8(%1, %%"REG_S
") \n\t"
1246 "movq %%mm2, 16(%1, %%"REG_S
") \n\t"
1247 "movq %%mm3, 24(%1, %%"REG_S
") \n\t"
1248 "add $32, %%"REG_S
" \n\t"
1250 :: "r" (src
+256), "r" (dest
+256), "m" (bias
)
1255 static void mix3to1_3dnow (sample_t
* samples
, sample_t bias
)
1258 "movd %1, %%mm7 \n\t"
1259 "punpckldq %1, %%mm7 \n\t"
1260 "mov $-1024, %%"REG_S
" \n\t"
1263 "movq (%0, %%"REG_S
"), %%mm0 \n\t"
1264 "movq 8(%0, %%"REG_S
"), %%mm1 \n\t"
1265 "movq 1024(%0, %%"REG_S
"), %%mm2\n\t"
1266 "movq 1032(%0, %%"REG_S
"), %%mm3\n\t"
1267 "pfadd 2048(%0, %%"REG_S
"), %%mm0\n\t"
1268 "pfadd 2056(%0, %%"REG_S
"), %%mm1\n\t"
1269 "pfadd %%mm7, %%mm0 \n\t"
1270 "pfadd %%mm7, %%mm1 \n\t"
1271 "pfadd %%mm2, %%mm0 \n\t"
1272 "pfadd %%mm3, %%mm1 \n\t"
1273 "movq %%mm0, (%0, %%"REG_S
") \n\t"
1274 "movq %%mm1, 8(%0, %%"REG_S
") \n\t"
1275 "add $16, %%"REG_S
" \n\t"
1277 :: "r" (samples
+256), "m" (bias
)
/*
 * mix4to1_3dnow: 3DNow! in-place sum of four sample regions plus a bias.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar added to every output value.
 *
 * Per 16-byte step:
 *   out[+0] = in[+0] + in[+1024] + in[+2048] + in[+3072] + bias
 * (byte displacements from the current position).
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1282 static void mix4to1_3dnow (sample_t
* samples
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1285 "movd %1, %%mm7 \n\t"
1286 "punpckldq %1, %%mm7 \n\t"
1287 "mov $-1024, %%"REG_S
" \n\t"
/* mm0/mm1 accumulate +0 and +2048; mm2/mm3 accumulate +1024 and +3072. */
1290 "movq (%0, %%"REG_S
"), %%mm0 \n\t"
1291 "movq 8(%0, %%"REG_S
"), %%mm1 \n\t"
1292 "movq 1024(%0, %%"REG_S
"), %%mm2\n\t"
1293 "movq 1032(%0, %%"REG_S
"), %%mm3\n\t"
1294 "pfadd 2048(%0, %%"REG_S
"), %%mm0\n\t"
1295 "pfadd 2056(%0, %%"REG_S
"), %%mm1\n\t"
1296 "pfadd 3072(%0, %%"REG_S
"), %%mm2\n\t"
1297 "pfadd 3080(%0, %%"REG_S
"), %%mm3\n\t"
/* Add the bias and merge the two partial sums. */
1298 "pfadd %%mm7, %%mm0 \n\t"
1299 "pfadd %%mm7, %%mm1 \n\t"
1300 "pfadd %%mm2, %%mm0 \n\t"
1301 "pfadd %%mm3, %%mm1 \n\t"
/* Store the result over the +0 region. */
1302 "movq %%mm0, (%0, %%"REG_S
") \n\t"
1303 "movq %%mm1, 8(%0, %%"REG_S
") \n\t"
1304 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1306 :: "r" (samples
+256), "m" (bias
)
/*
 * mix5to1_3dnow: 3DNow! in-place sum of five sample regions plus a bias.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar added to every output value.
 *
 * Per 16-byte step:
 *   out[+0] = in[+0] + in[+1024] + in[+2048] + in[+3072] + in[+4096] + bias
 * (byte displacements from the current position).
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1311 static void mix5to1_3dnow (sample_t
* samples
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1314 "movd %1, %%mm7 \n\t"
1315 "punpckldq %1, %%mm7 \n\t"
1316 "mov $-1024, %%"REG_S
" \n\t"
/* mm0/mm1 accumulate +0 and +2048; mm2/mm3 accumulate +1024, +3072, +4096. */
1319 "movq (%0, %%"REG_S
"), %%mm0 \n\t"
1320 "movq 8(%0, %%"REG_S
"), %%mm1 \n\t"
1321 "movq 1024(%0, %%"REG_S
"), %%mm2\n\t"
1322 "movq 1032(%0, %%"REG_S
"), %%mm3\n\t"
1323 "pfadd 2048(%0, %%"REG_S
"), %%mm0\n\t"
1324 "pfadd 2056(%0, %%"REG_S
"), %%mm1\n\t"
1325 "pfadd 3072(%0, %%"REG_S
"), %%mm2\n\t"
1326 "pfadd 3080(%0, %%"REG_S
"), %%mm3\n\t"
1327 "pfadd %%mm7, %%mm0 \n\t"
1328 "pfadd %%mm7, %%mm1 \n\t"
1329 "pfadd 4096(%0, %%"REG_S
"), %%mm2\n\t"
1330 "pfadd 4104(%0, %%"REG_S
"), %%mm3\n\t"
/* Merge the two partial sums. */
1331 "pfadd %%mm2, %%mm0 \n\t"
1332 "pfadd %%mm3, %%mm1 \n\t"
/* Store the result over the +0 region. */
1333 "movq %%mm0, (%0, %%"REG_S
") \n\t"
1334 "movq %%mm1, 8(%0, %%"REG_S
") \n\t"
1335 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1337 :: "r" (samples
+256), "m" (bias
)
/*
 * mix3to2_3dnow: 3DNow! in-place mix of three sample regions down to two.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar folded into the shared term.
 *
 * Per 16-byte step (byte displacements from the current position):
 *   common     = in[+1024] + bias
 *   out[+0]    = in[+0]    + common
 *   out[+1024] = in[+2048] + common
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1342 static void mix3to2_3dnow (sample_t
* samples
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1345 "movd %1, %%mm7 \n\t"
1346 "punpckldq %1, %%mm7 \n\t"
1347 "mov $-1024, %%"REG_S
" \n\t"
/* mm0/mm1 = shared term (+1024 region plus bias). */
1350 "movq 1024(%0, %%"REG_S
"), %%mm0\n\t"
1351 "movq 1032(%0, %%"REG_S
"), %%mm1\n\t"
1352 "pfadd %%mm7, %%mm0 \n\t" //common
1353 "pfadd %%mm7, %%mm1 \n\t" //common
/* mm2/mm3 = +0 region, mm4/mm5 = +2048 region; add the shared term to both. */
1354 "movq (%0, %%"REG_S
"), %%mm2 \n\t"
1355 "movq 8(%0, %%"REG_S
"), %%mm3 \n\t"
1356 "movq 2048(%0, %%"REG_S
"), %%mm4\n\t"
1357 "movq 2056(%0, %%"REG_S
"), %%mm5\n\t"
1358 "pfadd %%mm0, %%mm2 \n\t"
1359 "pfadd %%mm1, %%mm3 \n\t"
1360 "pfadd %%mm0, %%mm4 \n\t"
1361 "pfadd %%mm1, %%mm5 \n\t"
/* Results go to the +0 and +1024 regions. */
1362 "movq %%mm2, (%0, %%"REG_S
") \n\t"
1363 "movq %%mm3, 8(%0, %%"REG_S
") \n\t"
1364 "movq %%mm4, 1024(%0, %%"REG_S
")\n\t"
1365 "movq %%mm5, 1032(%0, %%"REG_S
")\n\t"
1366 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1368 :: "r" (samples
+256), "m" (bias
)
/*
 * mix21to2_3dnow: 3DNow! mix of a shared region into two separate regions.
 *
 * left, right - the two regions updated in place; the asm receives
 *               left+256 and right+256 with a byte index starting at
 *               -1024 (lands on `left`/`right` if sample_t is 4 bytes
 *               wide - assumption, confirm).
 * bias        - scalar folded into the shared term.
 *
 * Per 16-byte step (byte displacements from the current position):
 *   common = right[+1024] + bias
 *   left[+0]  += common
 *   right[+0] += common
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1373 static void mix21to2_3dnow (sample_t
* left
, sample_t
* right
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1376 "movd %2, %%mm7 \n\t"
1377 "punpckldq %2, %%mm7 \n\t"
1378 "mov $-1024, %%"REG_S
" \n\t"
/* mm0/mm1 = shared term: region 1024 bytes past `right`, plus bias. */
1381 "movq 1024(%1, %%"REG_S
"), %%mm0\n\t"
1382 "movq 1032(%1, %%"REG_S
"), %%mm1\n\t"
1383 "pfadd %%mm7, %%mm0 \n\t" //common
1384 "pfadd %%mm7, %%mm1 \n\t" //common
/* mm2/mm3 = left, mm4/mm5 = right; add the shared term to both. */
1385 "movq (%0, %%"REG_S
"), %%mm2 \n\t"
1386 "movq 8(%0, %%"REG_S
"), %%mm3 \n\t"
1387 "movq (%1, %%"REG_S
"), %%mm4 \n\t"
1388 "movq 8(%1, %%"REG_S
"), %%mm5 \n\t"
1389 "pfadd %%mm0, %%mm2 \n\t"
1390 "pfadd %%mm1, %%mm3 \n\t"
1391 "pfadd %%mm0, %%mm4 \n\t"
1392 "pfadd %%mm1, %%mm5 \n\t"
/* Write both results back where they were read from. */
1393 "movq %%mm2, (%0, %%"REG_S
") \n\t"
1394 "movq %%mm3, 8(%0, %%"REG_S
") \n\t"
1395 "movq %%mm4, (%1, %%"REG_S
") \n\t"
1396 "movq %%mm5, 8(%1, %%"REG_S
") \n\t"
1397 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = left+256, %1 = right+256 (registers), %2 = bias (memory). */
1399 :: "r" (left
+256), "r" (right
+256), "m" (bias
)
/*
 * mix21toS_3dnow: 3DNow! in-place mix where a third region is subtracted
 * from the first output and added to the second.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar added to both outputs.
 *
 * Per 16-byte step (byte displacements from the current position):
 *   surround   = in[+2048]
 *   out[+0]    = in[+0]    + bias - surround
 *   out[+1024] = in[+1024] + bias + surround
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1404 static void mix21toS_3dnow (sample_t
* samples
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1407 "movd %1, %%mm7 \n\t"
1408 "punpckldq %1, %%mm7 \n\t"
1409 "mov $-1024, %%"REG_S
" \n\t"
1412 "movq 2048(%0, %%"REG_S
"), %%mm0\n\t" // surround
1413 "movq 2056(%0, %%"REG_S
"), %%mm1\n\t" // surround
/* mm2/mm3 = +0 region, mm4/mm5 = +1024 region. */
1414 "movq (%0, %%"REG_S
"), %%mm2 \n\t"
1415 "movq 8(%0, %%"REG_S
"), %%mm3 \n\t"
1416 "movq 1024(%0, %%"REG_S
"), %%mm4\n\t"
1417 "movq 1032(%0, %%"REG_S
"), %%mm5\n\t"
/* Bias goes into both outputs. */
1418 "pfadd %%mm7, %%mm2 \n\t"
1419 "pfadd %%mm7, %%mm3 \n\t"
1420 "pfadd %%mm7, %%mm4 \n\t"
1421 "pfadd %%mm7, %%mm5 \n\t"
/* Surround is subtracted from the first output and added to the second. */
1422 "pfsub %%mm0, %%mm2 \n\t"
1423 "pfsub %%mm1, %%mm3 \n\t"
1424 "pfadd %%mm0, %%mm4 \n\t"
1425 "pfadd %%mm1, %%mm5 \n\t"
1426 "movq %%mm2, (%0, %%"REG_S
") \n\t"
1427 "movq %%mm3, 8(%0, %%"REG_S
") \n\t"
1428 "movq %%mm4, 1024(%0, %%"REG_S
")\n\t"
1429 "movq %%mm5, 1032(%0, %%"REG_S
")\n\t"
1430 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1432 :: "r" (samples
+256), "m" (bias
)
/*
 * mix31to2_3dnow: 3DNow! in-place mix of four sample regions down to two.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar folded into the shared term.
 *
 * Per 16-byte step (byte displacements from the current position):
 *   common     = in[+1024] + in[+3072] + bias
 *   out[+0]    = in[+0]    + common
 *   out[+1024] = in[+2048] + common
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1437 static void mix31to2_3dnow (sample_t
* samples
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1440 "movd %1, %%mm7 \n\t"
1441 "punpckldq %1, %%mm7 \n\t"
1442 "mov $-1024, %%"REG_S
" \n\t"
/* mm0/mm1 = shared term: +1024 region + +3072 region + bias. */
1445 "movq 1024(%0, %%"REG_S
"), %%mm0\n\t"
1446 "movq 1032(%0, %%"REG_S
"), %%mm1\n\t"
1447 "pfadd 3072(%0, %%"REG_S
"), %%mm0\n\t"
1448 "pfadd 3080(%0, %%"REG_S
"), %%mm1\n\t"
1449 "pfadd %%mm7, %%mm0 \n\t" // common
1450 "pfadd %%mm7, %%mm1 \n\t" // common
/* mm2/mm3 = +0 region, mm4/mm5 = +2048 region; add the shared term to both. */
1451 "movq (%0, %%"REG_S
"), %%mm2 \n\t"
1452 "movq 8(%0, %%"REG_S
"), %%mm3 \n\t"
1453 "movq 2048(%0, %%"REG_S
"), %%mm4\n\t"
1454 "movq 2056(%0, %%"REG_S
"), %%mm5\n\t"
1455 "pfadd %%mm0, %%mm2 \n\t"
1456 "pfadd %%mm1, %%mm3 \n\t"
1457 "pfadd %%mm0, %%mm4 \n\t"
1458 "pfadd %%mm1, %%mm5 \n\t"
/* Results go to the +0 and +1024 regions. */
1459 "movq %%mm2, (%0, %%"REG_S
") \n\t"
1460 "movq %%mm3, 8(%0, %%"REG_S
") \n\t"
1461 "movq %%mm4, 1024(%0, %%"REG_S
")\n\t"
1462 "movq %%mm5, 1032(%0, %%"REG_S
")\n\t"
1463 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1465 :: "r" (samples
+256), "m" (bias
)
/*
 * mix31toS_3dnow: 3DNow! in-place mix of four sample regions down to two,
 * with the fourth region subtracted from one output and added to the other.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar folded into the shared term.
 *
 * Per 16-byte step (byte displacements from the current position):
 *   common     = in[+1024] + bias
 *   surround   = in[+3072]
 *   out[+0]    = in[+0]    + common - surround
 *   out[+1024] = in[+2048] + common + surround
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1470 static void mix31toS_3dnow (sample_t
* samples
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1473 "movd %1, %%mm7 \n\t"
1474 "punpckldq %1, %%mm7 \n\t"
1475 "mov $-1024, %%"REG_S
" \n\t"
/* mm0/mm1 = shared term (+1024 region plus bias). */
1478 "movq 1024(%0, %%"REG_S
"), %%mm0\n\t"
1479 "movq 1032(%0, %%"REG_S
"), %%mm1\n\t"
1480 "pfadd %%mm7, %%mm0 \n\t" // common
1481 "pfadd %%mm7, %%mm1 \n\t" // common
/* mm2/mm3 = +0 region, mm4/mm5 = +2048 region; add the shared term to both. */
1482 "movq (%0, %%"REG_S
"), %%mm2 \n\t"
1483 "movq 8(%0, %%"REG_S
"), %%mm3 \n\t"
1484 "movq 2048(%0, %%"REG_S
"), %%mm4\n\t"
1485 "movq 2056(%0, %%"REG_S
"), %%mm5\n\t"
1486 "pfadd %%mm0, %%mm2 \n\t"
1487 "pfadd %%mm1, %%mm3 \n\t"
1488 "pfadd %%mm0, %%mm4 \n\t"
1489 "pfadd %%mm1, %%mm5 \n\t"
/* mm0/mm1 are free again and are reused for the surround region. */
1490 "movq 3072(%0, %%"REG_S
"), %%mm0\n\t" // surround
1491 "movq 3080(%0, %%"REG_S
"), %%mm1\n\t" // surround
1492 "pfsub %%mm0, %%mm2 \n\t"
1493 "pfsub %%mm1, %%mm3 \n\t"
1494 "pfadd %%mm0, %%mm4 \n\t"
1495 "pfadd %%mm1, %%mm5 \n\t"
/* Results go to the +0 and +1024 regions. */
1496 "movq %%mm2, (%0, %%"REG_S
") \n\t"
1497 "movq %%mm3, 8(%0, %%"REG_S
") \n\t"
1498 "movq %%mm4, 1024(%0, %%"REG_S
")\n\t"
1499 "movq %%mm5, 1032(%0, %%"REG_S
")\n\t"
1500 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1502 :: "r" (samples
+256), "m" (bias
)
/*
 * mix22toS_3dnow: 3DNow! in-place mix of four sample regions down to two,
 * with the sum of the last two regions subtracted from one output and
 * added to the other.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar added to both outputs.
 *
 * Per 16-byte step (byte displacements from the current position):
 *   surround   = in[+2048] + in[+3072]
 *   out[+0]    = in[+0]    + bias - surround
 *   out[+1024] = in[+1024] + bias + surround
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1507 static void mix22toS_3dnow (sample_t
* samples
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1510 "movd %1, %%mm7 \n\t"
1511 "punpckldq %1, %%mm7 \n\t"
1512 "mov $-1024, %%"REG_S
" \n\t"
/* mm0/mm1 = +2048 region + +3072 region. */
1515 "movq 2048(%0, %%"REG_S
"), %%mm0\n\t"
1516 "movq 2056(%0, %%"REG_S
"), %%mm1\n\t"
1517 "pfadd 3072(%0, %%"REG_S
"), %%mm0\n\t" // surround
1518 "pfadd 3080(%0, %%"REG_S
"), %%mm1\n\t" // surround
/* mm2/mm3 = +0 region, mm4/mm5 = +1024 region. */
1519 "movq (%0, %%"REG_S
"), %%mm2 \n\t"
1520 "movq 8(%0, %%"REG_S
"), %%mm3 \n\t"
1521 "movq 1024(%0, %%"REG_S
"), %%mm4\n\t"
1522 "movq 1032(%0, %%"REG_S
"), %%mm5\n\t"
/* Bias goes into both outputs. */
1523 "pfadd %%mm7, %%mm2 \n\t"
1524 "pfadd %%mm7, %%mm3 \n\t"
1525 "pfadd %%mm7, %%mm4 \n\t"
1526 "pfadd %%mm7, %%mm5 \n\t"
/* Surround is subtracted from the first output and added to the second. */
1527 "pfsub %%mm0, %%mm2 \n\t"
1528 "pfsub %%mm1, %%mm3 \n\t"
1529 "pfadd %%mm0, %%mm4 \n\t"
1530 "pfadd %%mm1, %%mm5 \n\t"
1531 "movq %%mm2, (%0, %%"REG_S
") \n\t"
1532 "movq %%mm3, 8(%0, %%"REG_S
") \n\t"
1533 "movq %%mm4, 1024(%0, %%"REG_S
")\n\t"
1534 "movq %%mm5, 1032(%0, %%"REG_S
")\n\t"
1535 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1537 :: "r" (samples
+256), "m" (bias
)
/*
 * mix32to2_3dnow: 3DNow! in-place mix of five sample regions down to two.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar folded into the shared term.
 *
 * Per 16-byte step (byte displacements from the current position):
 *   common     = in[+1024] + bias
 *   out[+0]    = common + in[+0]    + in[+3072]
 *   out[+1024] = common + in[+2048] + in[+4096]
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1542 static void mix32to2_3dnow (sample_t
* samples
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1545 "movd %1, %%mm7 \n\t"
1546 "punpckldq %1, %%mm7 \n\t"
1547 "mov $-1024, %%"REG_S
" \n\t"
/* Build the shared term in mm0/mm1 and duplicate it into mm2/mm3. */
1550 "movq 1024(%0, %%"REG_S
"), %%mm0\n\t"
1551 "movq 1032(%0, %%"REG_S
"), %%mm1\n\t"
1552 "pfadd %%mm7, %%mm0 \n\t" // common
1553 "pfadd %%mm7, %%mm1 \n\t" // common
1554 "movq %%mm0, %%mm2 \n\t" // common
1555 "movq %%mm1, %%mm3 \n\t" // common
/* First output accumulates +0 and +3072; second accumulates +2048 and +4096. */
1556 "pfadd (%0, %%"REG_S
"), %%mm0 \n\t"
1557 "pfadd 8(%0, %%"REG_S
"), %%mm1 \n\t"
1558 "pfadd 2048(%0, %%"REG_S
"), %%mm2\n\t"
1559 "pfadd 2056(%0, %%"REG_S
"), %%mm3\n\t"
1560 "pfadd 3072(%0, %%"REG_S
"), %%mm0\n\t"
1561 "pfadd 3080(%0, %%"REG_S
"), %%mm1\n\t"
1562 "pfadd 4096(%0, %%"REG_S
"), %%mm2\n\t"
1563 "pfadd 4104(%0, %%"REG_S
"), %%mm3\n\t"
/* Results go to the +0 and +1024 regions. */
1564 "movq %%mm0, (%0, %%"REG_S
") \n\t"
1565 "movq %%mm1, 8(%0, %%"REG_S
") \n\t"
1566 "movq %%mm2, 1024(%0, %%"REG_S
")\n\t"
1567 "movq %%mm3, 1032(%0, %%"REG_S
")\n\t"
1568 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1570 :: "r" (samples
+256), "m" (bias
)
1575 /* todo: should be optimized better */
/*
 * mix32toS_3dnow: 3DNow! in-place mix of five sample regions down to two,
 * with the sum of the last two regions subtracted from one output and
 * added to the other.
 *
 * samples - base of the sample buffer; the asm receives samples+256 and a
 *           byte index starting at -1024 (lands on `samples` if sample_t
 *           is 4 bytes wide - assumption, confirm).
 * bias    - scalar folded into the shared term.
 *
 * Per 16-byte step (byte displacements from the current position):
 *   common     = in[+1024] + bias
 *   surround   = in[+3072] + in[+4096]
 *   out[+0]    = in[+0]    - surround + common
 *   out[+1024] = in[+2048] + surround + common
 *
 * Unlike the sibling routines, the bias load into mm7 comes after the
 * counter initialisation: mm7 is overwritten with sample data further
 * down, so it cannot hold the bias across steps (presumably the load sits
 * inside the loop - the loop label is not visible in this excerpt).
 * NOTE(review): the asm opener, loop branch and clobber list are likewise
 * not visible here.
 */
1576 static void mix32toS_3dnow (sample_t
* samples
, sample_t bias
)
1579 "mov $-1024, %%"REG_S
" \n\t"
/* Copy the 32-bit bias into both halves of mm7. */
1582 "movd %1, %%mm7 \n\t"
1583 "punpckldq %1, %%mm7 \n\t"
/* mm0/mm1 = +1024 region, mm4/mm5 = +3072 region. */
1584 "movq 1024(%0, %%"REG_S
"), %%mm0\n\t"
1585 "movq 1032(%0, %%"REG_S
"), %%mm1\n\t"
1586 "movq 3072(%0, %%"REG_S
"), %%mm4\n\t"
1587 "movq 3080(%0, %%"REG_S
"), %%mm5\n\t"
1588 "pfadd %%mm7, %%mm0 \n\t" // common
1589 "pfadd %%mm7, %%mm1 \n\t" // common
1590 "pfadd 4096(%0, %%"REG_S
"), %%mm4\n\t" // surround
1591 "pfadd 4104(%0, %%"REG_S
"), %%mm5\n\t" // surround
/* mm2/mm3 = +0 region, mm6/mm7 = +2048 region (mm7 no longer holds the bias). */
1592 "movq (%0, %%"REG_S
"), %%mm2 \n\t"
1593 "movq 8(%0, %%"REG_S
"), %%mm3 \n\t"
1594 "movq 2048(%0, %%"REG_S
"), %%mm6\n\t"
1595 "movq 2056(%0, %%"REG_S
"), %%mm7\n\t"
/* Surround is subtracted from the first output and added to the second. */
1596 "pfsub %%mm4, %%mm2 \n\t"
1597 "pfsub %%mm5, %%mm3 \n\t"
1598 "pfadd %%mm4, %%mm6 \n\t"
1599 "pfadd %%mm5, %%mm7 \n\t"
/* Add the shared term to both outputs. */
1600 "pfadd %%mm0, %%mm2 \n\t"
1601 "pfadd %%mm1, %%mm3 \n\t"
1602 "pfadd %%mm0, %%mm6 \n\t"
1603 "pfadd %%mm1, %%mm7 \n\t"
/* Results go to the +0 and +1024 regions. */
1604 "movq %%mm2, (%0, %%"REG_S
") \n\t"
1605 "movq %%mm3, 8(%0, %%"REG_S
") \n\t"
1606 "movq %%mm6, 1024(%0, %%"REG_S
")\n\t"
1607 "movq %%mm7, 1032(%0, %%"REG_S
")\n\t"
1608 "add $16, %%"REG_S
" \n\t"
/* Operands: %0 = samples+256 (register), %1 = bias (memory). */
1610 :: "r" (samples
+256), "m" (bias
)
/*
 * move2to1_3dnow: 3DNow! sum of two adjacent source regions plus a bias,
 * written to a separate destination.
 *
 * src  - source buffer; read at byte displacements +0 and +1024 from the
 *        current position.
 * dest - destination buffer; only written, never read, by the visible code.
 * bias - scalar added to every output value.
 *
 * The asm receives src+256 and dest+256 with a byte index starting at
 * -1024 (lands on `src`/`dest` if sample_t is 4 bytes wide - assumption,
 * confirm). Each step handles 32 bytes:
 *   dest[+0] = src[+0] + src[+1024] + bias
 * NOTE(review): the asm opener, loop label/branch and clobber list are not
 * visible in this excerpt.
 */
1615 static void move2to1_3dnow (sample_t
* src
, sample_t
* dest
, sample_t bias
)
/* Copy the 32-bit bias into both halves of mm7. */
1618 "movd %2, %%mm7 \n\t"
1619 "punpckldq %2, %%mm7 \n\t"
1620 "mov $-1024, %%"REG_S
" \n\t"
/* Load four quadwords from the first source region. */
1623 "movq (%0, %%"REG_S
"), %%mm0 \n\t"
1624 "movq 8(%0, %%"REG_S
"), %%mm1 \n\t"
1625 "movq 16(%0, %%"REG_S
"), %%mm2 \n\t"
1626 "movq 24(%0, %%"REG_S
"), %%mm3 \n\t"
/* Add the matching quadwords 1024 bytes further into the source. */
1627 "pfadd 1024(%0, %%"REG_S
"), %%mm0\n\t"
1628 "pfadd 1032(%0, %%"REG_S
"), %%mm1\n\t"
1629 "pfadd 1040(%0, %%"REG_S
"), %%mm2\n\t"
1630 "pfadd 1048(%0, %%"REG_S
"), %%mm3\n\t"
/* Add the bias. */
1631 "pfadd %%mm7, %%mm0 \n\t"
1632 "pfadd %%mm7, %%mm1 \n\t"
1633 "pfadd %%mm7, %%mm2 \n\t"
1634 "pfadd %%mm7, %%mm3 \n\t"
/* Store to the destination buffer. */
1635 "movq %%mm0, (%1, %%"REG_S
") \n\t"
1636 "movq %%mm1, 8(%1, %%"REG_S
") \n\t"
1637 "movq %%mm2, 16(%1, %%"REG_S
") \n\t"
1638 "movq %%mm3, 24(%1, %%"REG_S
") \n\t"
1639 "add $32, %%"REG_S
" \n\t"
/* Operands: %0 = src+256, %1 = dest+256 (registers), %2 = bias (memory). */
1641 :: "r" (src
+256), "r" (dest
+256), "m" (bias
)
/*
 * downmix_3dnow: 3DNow! downmix dispatcher.
 *
 * Switches on CONVERT (acmod, output & A52_CHANNEL_MASK) and, for each
 * supported input/output combination, calls the matching mix*_3dnow /
 * move2to1_3dnow helper and/or memcpy on `samples`. Offsets passed to the
 * helpers and to memcpy are multiples of 256 sample_t elements, and every
 * memcpy moves exactly 256 samples.
 *
 * samples - sample buffer, modified in place.
 * acmod   - input channel configuration (A52_* constant).
 * output  - requested output configuration; only the bits selected by
 *           A52_CHANNEL_MASK take part in the dispatch.
 * bias    - forwarded unchanged to every helper.
 * clev, slev - not referenced by any line visible in this excerpt.
 *
 * After the switch a `femms` instruction is issued with a "memory" clobber
 * (the 3DNow! instruction for leaving MMX state).
 *
 * NOTE(review): this excerpt is incomplete - the labels targeted by the
 * gotos (mix_2to1_3dnow, mix_3to1_3dnow, mix_3to2_3dnow), any conditions
 * guarding those gotos, and any break statements between cases are not
 * visible here. Check the full file before relying on the control flow.
 */
1646 static void downmix_3dnow (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
1647 sample_t clev
, sample_t slev
)
1649 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
/* Copy the second 256-sample block over the first. */
1651 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
1652 memcpy (samples
, samples
+ 256, 256 * sizeof (sample_t
));
1655 case CONVERT (A52_CHANNEL
, A52_MONO
):
1656 case CONVERT (A52_STEREO
, A52_MONO
):
1658 mix2to1_3dnow (samples
, samples
+ 256, bias
);
1661 case CONVERT (A52_2F1R
, A52_MONO
):
1663 goto mix_2to1_3dnow
;
1664 case CONVERT (A52_3F
, A52_MONO
):
1666 mix3to1_3dnow (samples
, bias
);
1669 case CONVERT (A52_3F1R
, A52_MONO
):
1671 goto mix_3to1_3dnow
;
1672 case CONVERT (A52_2F2R
, A52_MONO
):
1674 goto mix_2to1_3dnow
;
1675 mix4to1_3dnow (samples
, bias
);
1678 case CONVERT (A52_3F2R
, A52_MONO
):
1680 goto mix_3to1_3dnow
;
1681 mix5to1_3dnow (samples
, bias
);
/* Duplicate the first 256-sample block into the second. */
1684 case CONVERT (A52_MONO
, A52_DOLBY
):
1685 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
1688 case CONVERT (A52_3F
, A52_STEREO
):
1689 case CONVERT (A52_3F
, A52_DOLBY
):
1691 mix3to2_3dnow (samples
, bias
);
1694 case CONVERT (A52_2F1R
, A52_STEREO
):
1697 mix21to2_3dnow (samples
, samples
+ 256, bias
);
1700 case CONVERT (A52_2F1R
, A52_DOLBY
):
1701 mix21toS_3dnow (samples
, bias
);
1704 case CONVERT (A52_3F1R
, A52_STEREO
):
1706 goto mix_3to2_3dnow
;
1707 mix31to2_3dnow (samples
, bias
);
1710 case CONVERT (A52_3F1R
, A52_DOLBY
):
1711 mix31toS_3dnow (samples
, bias
);
/* Two independent two-block mixes: blocks 0 and 2, then blocks 1 and 3. */
1714 case CONVERT (A52_2F2R
, A52_STEREO
):
1717 mix2to1_3dnow (samples
, samples
+ 512, bias
);
1718 mix2to1_3dnow (samples
+ 256, samples
+ 768, bias
);
1721 case CONVERT (A52_2F2R
, A52_DOLBY
):
1722 mix22toS_3dnow (samples
, bias
);
1725 case CONVERT (A52_3F2R
, A52_STEREO
):
1727 goto mix_3to2_3dnow
;
1728 mix32to2_3dnow (samples
, bias
);
1731 case CONVERT (A52_3F2R
, A52_DOLBY
):
1732 mix32toS_3dnow (samples
, bias
);
1735 case CONVERT (A52_3F1R
, A52_3F
):
1738 mix21to2_3dnow (samples
, samples
+ 512, bias
);
/* Two independent two-block mixes: blocks 0 and 3, then blocks 2 and 4. */
1741 case CONVERT (A52_3F2R
, A52_3F
):
1744 mix2to1_3dnow (samples
, samples
+ 768, bias
);
1745 mix2to1_3dnow (samples
+ 512, samples
+ 1024, bias
);
/* Mix the first three blocks to two, then move block 3 down to block 2. */
1748 case CONVERT (A52_3F1R
, A52_2F1R
):
1749 mix3to2_3dnow (samples
, bias
);
1750 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1753 case CONVERT (A52_2F2R
, A52_2F1R
):
1754 mix2to1_3dnow (samples
+ 512, samples
+ 768, bias
);
1757 case CONVERT (A52_3F2R
, A52_2F1R
):
1758 mix3to2_3dnow (samples
, bias
); //FIXME possible bug? (output doesnt seem to be used)
1759 move2to1_3dnow (samples
+ 768, samples
+ 512, bias
);
1762 case CONVERT (A52_3F2R
, A52_3F1R
):
1763 mix2to1_3dnow (samples
+ 768, samples
+ 1024, bias
);
/* Duplicate block 2 into block 3. */
1766 case CONVERT (A52_2F1R
, A52_2F2R
):
1767 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
1770 case CONVERT (A52_3F1R
, A52_2F2R
):
1771 mix3to2_3dnow (samples
, bias
);
1772 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
/* Mix the first three blocks to two, then shift blocks 3 and 4 down by one. */
1775 case CONVERT (A52_3F2R
, A52_2F2R
):
1776 mix3to2_3dnow (samples
, bias
);
1777 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1778 memcpy (samples
+ 768, samples
+ 1024, 256 * sizeof (sample_t
));
/* Duplicate block 3 into block 4. */
1781 case CONVERT (A52_3F1R
, A52_3F2R
):
1782 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
/* Leave MMX/3DNow! state; "memory" keeps the compiler from reordering memory accesses across it. */
1785 __asm__
volatile("femms":::"memory");
1788 #endif // ARCH_X86 || ARCH_X86_64