/*
 * Copyright (C) 2000-2001 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of a52dec, a free ATSC A-52 stream decoder.
 * See http://liba52.sourceforge.net/ for updates.
 *
 * Modified for use with MPlayer, changes contained in liba52_changes.diff.
 * detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
 *
 * a52dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * a52dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
 */
#include <string.h>

#include "a52_internal.h"
/*
 * Pack an (input coding mode, output mode) pair into a single integer so the
 * pair can be used as a switch/case label: the output mode is shifted left by
 * three bits and the acmod is added in below it.  Callers pass an acmod that
 * fits in three bits (e.g. `input & 7`).
 */
#define CONVERT(acmod,output) (((output) << 3) + (acmod))
42 void (*downmix
)(sample_t
* samples
, int acmod
, int output
, sample_t bias
,
43 sample_t clev
, sample_t slev
)= NULL
;
44 void (*upmix
)(sample_t
* samples
, int acmod
, int output
)= NULL
;
46 static void downmix_SSE (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
47 sample_t clev
, sample_t slev
);
48 static void downmix_3dnow (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
49 sample_t clev
, sample_t slev
);
50 static void downmix_C (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
51 sample_t clev
, sample_t slev
);
52 static void upmix_MMX (sample_t
* samples
, int acmod
, int output
);
53 static void upmix_C (sample_t
* samples
, int acmod
, int output
);
55 void downmix_accel_init(uint32_t mm_accel
)
60 if(mm_accel
& MM_ACCEL_X86_MMX
) upmix
= upmix_MMX
;
61 if(mm_accel
& MM_ACCEL_X86_SSE
) downmix
= downmix_SSE
;
62 if(mm_accel
& MM_ACCEL_X86_3DNOW
) downmix
= downmix_3dnow
;
66 int downmix_init (int input
, int flags
, sample_t
* level
,
67 sample_t clev
, sample_t slev
)
69 static uint8_t table
[11][8] = {
70 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_STEREO
,
71 A52_STEREO
, A52_STEREO
, A52_STEREO
, A52_STEREO
},
72 {A52_MONO
, A52_MONO
, A52_MONO
, A52_MONO
,
73 A52_MONO
, A52_MONO
, A52_MONO
, A52_MONO
},
74 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_STEREO
,
75 A52_STEREO
, A52_STEREO
, A52_STEREO
, A52_STEREO
},
76 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_3F
,
77 A52_STEREO
, A52_3F
, A52_STEREO
, A52_3F
},
78 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_STEREO
,
79 A52_2F1R
, A52_2F1R
, A52_2F1R
, A52_2F1R
},
80 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_STEREO
,
81 A52_2F1R
, A52_3F1R
, A52_2F1R
, A52_3F1R
},
82 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_3F
,
83 A52_2F2R
, A52_2F2R
, A52_2F2R
, A52_2F2R
},
84 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_3F
,
85 A52_2F2R
, A52_3F2R
, A52_2F2R
, A52_3F2R
},
86 {A52_CHANNEL1
, A52_MONO
, A52_MONO
, A52_MONO
,
87 A52_MONO
, A52_MONO
, A52_MONO
, A52_MONO
},
88 {A52_CHANNEL2
, A52_MONO
, A52_MONO
, A52_MONO
,
89 A52_MONO
, A52_MONO
, A52_MONO
, A52_MONO
},
90 {A52_CHANNEL
, A52_DOLBY
, A52_STEREO
, A52_DOLBY
,
91 A52_DOLBY
, A52_DOLBY
, A52_DOLBY
, A52_DOLBY
}
95 output
= flags
& A52_CHANNEL_MASK
;
96 if (output
> A52_DOLBY
)
99 output
= table
[output
][input
& 7];
101 if ((output
== A52_STEREO
) &&
102 ((input
== A52_DOLBY
) || ((input
== A52_3F
) && (clev
== LEVEL_3DB
))))
105 if (flags
& A52_ADJUST_LEVEL
)
106 switch (CONVERT (input
& 7, output
)) {
108 case CONVERT (A52_3F
, A52_MONO
):
109 *level
*= LEVEL_3DB
/ (1 + clev
);
112 case CONVERT (A52_STEREO
, A52_MONO
):
113 case CONVERT (A52_2F2R
, A52_2F1R
):
114 case CONVERT (A52_3F2R
, A52_3F1R
):
119 case CONVERT (A52_3F2R
, A52_2F1R
):
120 if (clev
< LEVEL_PLUS3DB
- 1)
123 case CONVERT (A52_3F
, A52_STEREO
):
124 case CONVERT (A52_3F1R
, A52_2F1R
):
125 case CONVERT (A52_3F1R
, A52_2F2R
):
126 case CONVERT (A52_3F2R
, A52_2F2R
):
130 case CONVERT (A52_2F1R
, A52_MONO
):
131 *level
*= LEVEL_PLUS3DB
/ (2 + slev
);
134 case CONVERT (A52_2F1R
, A52_STEREO
):
135 case CONVERT (A52_3F1R
, A52_3F
):
136 *level
/= 1 + slev
* LEVEL_3DB
;
139 case CONVERT (A52_3F1R
, A52_MONO
):
140 *level
*= LEVEL_3DB
/ (1 + clev
+ 0.5 * slev
);
143 case CONVERT (A52_3F1R
, A52_STEREO
):
144 *level
/= 1 + clev
+ slev
* LEVEL_3DB
;
147 case CONVERT (A52_2F2R
, A52_MONO
):
148 *level
*= LEVEL_3DB
/ (1 + slev
);
151 case CONVERT (A52_2F2R
, A52_STEREO
):
152 case CONVERT (A52_3F2R
, A52_3F
):
156 case CONVERT (A52_3F2R
, A52_MONO
):
157 *level
*= LEVEL_3DB
/ (1 + clev
+ slev
);
160 case CONVERT (A52_3F2R
, A52_STEREO
):
161 *level
/= 1 + clev
+ slev
;
164 case CONVERT (A52_MONO
, A52_DOLBY
):
165 *level
*= LEVEL_PLUS3DB
;
168 case CONVERT (A52_3F
, A52_DOLBY
):
169 case CONVERT (A52_2F1R
, A52_DOLBY
):
170 *level
*= 1 / (1 + LEVEL_3DB
);
173 case CONVERT (A52_3F1R
, A52_DOLBY
):
174 case CONVERT (A52_2F2R
, A52_DOLBY
):
175 *level
*= 1 / (1 + 2 * LEVEL_3DB
);
178 case CONVERT (A52_3F2R
, A52_DOLBY
):
179 *level
*= 1 / (1 + 3 * LEVEL_3DB
);
185 int downmix_coeff (sample_t
* coeff
, int acmod
, int output
, sample_t level
,
186 sample_t clev
, sample_t slev
)
188 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
190 case CONVERT (A52_CHANNEL
, A52_CHANNEL
):
191 case CONVERT (A52_MONO
, A52_MONO
):
192 case CONVERT (A52_STEREO
, A52_STEREO
):
193 case CONVERT (A52_3F
, A52_3F
):
194 case CONVERT (A52_2F1R
, A52_2F1R
):
195 case CONVERT (A52_3F1R
, A52_3F1R
):
196 case CONVERT (A52_2F2R
, A52_2F2R
):
197 case CONVERT (A52_3F2R
, A52_3F2R
):
198 case CONVERT (A52_STEREO
, A52_DOLBY
):
199 coeff
[0] = coeff
[1] = coeff
[2] = coeff
[3] = coeff
[4] = level
;
202 case CONVERT (A52_CHANNEL
, A52_MONO
):
203 coeff
[0] = coeff
[1] = level
* LEVEL_6DB
;
206 case CONVERT (A52_STEREO
, A52_MONO
):
207 coeff
[0] = coeff
[1] = level
* LEVEL_3DB
;
210 case CONVERT (A52_3F
, A52_MONO
):
211 coeff
[0] = coeff
[2] = level
* LEVEL_3DB
;
212 coeff
[1] = level
* clev
* LEVEL_PLUS3DB
;
215 case CONVERT (A52_2F1R
, A52_MONO
):
216 coeff
[0] = coeff
[1] = level
* LEVEL_3DB
;
217 coeff
[2] = level
* slev
* LEVEL_3DB
;
220 case CONVERT (A52_2F2R
, A52_MONO
):
221 coeff
[0] = coeff
[1] = level
* LEVEL_3DB
;
222 coeff
[2] = coeff
[3] = level
* slev
* LEVEL_3DB
;
225 case CONVERT (A52_3F1R
, A52_MONO
):
226 coeff
[0] = coeff
[2] = level
* LEVEL_3DB
;
227 coeff
[1] = level
* clev
* LEVEL_PLUS3DB
;
228 coeff
[3] = level
* slev
* LEVEL_3DB
;
231 case CONVERT (A52_3F2R
, A52_MONO
):
232 coeff
[0] = coeff
[2] = level
* LEVEL_3DB
;
233 coeff
[1] = level
* clev
* LEVEL_PLUS3DB
;
234 coeff
[3] = coeff
[4] = level
* slev
* LEVEL_3DB
;
237 case CONVERT (A52_MONO
, A52_DOLBY
):
238 coeff
[0] = level
* LEVEL_3DB
;
241 case CONVERT (A52_3F
, A52_DOLBY
):
243 case CONVERT (A52_3F
, A52_STEREO
):
244 case CONVERT (A52_3F1R
, A52_2F1R
):
245 case CONVERT (A52_3F2R
, A52_2F2R
):
246 coeff
[0] = coeff
[2] = coeff
[3] = coeff
[4] = level
;
247 coeff
[1] = level
* clev
;
250 case CONVERT (A52_2F1R
, A52_DOLBY
):
252 case CONVERT (A52_2F1R
, A52_STEREO
):
253 coeff
[0] = coeff
[1] = level
;
254 coeff
[2] = level
* slev
* LEVEL_3DB
;
257 case CONVERT (A52_3F1R
, A52_DOLBY
):
260 case CONVERT (A52_3F1R
, A52_STEREO
):
261 coeff
[0] = coeff
[2] = level
;
262 coeff
[1] = level
* clev
;
263 coeff
[3] = level
* slev
* LEVEL_3DB
;
266 case CONVERT (A52_2F2R
, A52_DOLBY
):
268 case CONVERT (A52_2F2R
, A52_STEREO
):
269 coeff
[0] = coeff
[1] = level
;
270 coeff
[2] = coeff
[3] = level
* slev
;
273 case CONVERT (A52_3F2R
, A52_DOLBY
):
275 case CONVERT (A52_3F2R
, A52_2F1R
):
277 case CONVERT (A52_3F2R
, A52_STEREO
):
278 coeff
[0] = coeff
[2] = level
;
279 coeff
[1] = level
* clev
;
280 coeff
[3] = coeff
[4] = level
* slev
;
283 case CONVERT (A52_3F1R
, A52_3F
):
284 coeff
[0] = coeff
[1] = coeff
[2] = level
;
285 coeff
[3] = level
* slev
* LEVEL_3DB
;
288 case CONVERT (A52_3F2R
, A52_3F
):
289 coeff
[0] = coeff
[1] = coeff
[2] = level
;
290 coeff
[3] = coeff
[4] = level
* slev
;
293 case CONVERT (A52_2F2R
, A52_2F1R
):
294 coeff
[0] = coeff
[1] = level
;
295 coeff
[2] = coeff
[3] = level
* LEVEL_3DB
;
298 case CONVERT (A52_3F2R
, A52_3F1R
):
299 coeff
[0] = coeff
[1] = coeff
[2] = level
;
300 coeff
[3] = coeff
[4] = level
* LEVEL_3DB
;
303 case CONVERT (A52_2F1R
, A52_2F2R
):
304 coeff
[0] = coeff
[1] = level
;
305 coeff
[2] = level
* LEVEL_3DB
;
308 case CONVERT (A52_3F1R
, A52_2F2R
):
309 coeff
[0] = coeff
[2] = level
;
310 coeff
[1] = level
* clev
;
311 coeff
[3] = level
* LEVEL_3DB
;
314 case CONVERT (A52_3F1R
, A52_3F2R
):
315 coeff
[0] = coeff
[1] = coeff
[2] = level
;
316 coeff
[3] = level
* LEVEL_3DB
;
319 case CONVERT (A52_CHANNEL
, A52_CHANNEL1
):
324 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
330 return -1; /* NOTREACHED */
333 static void mix2to1 (sample_t
* dest
, sample_t
* src
, sample_t bias
)
337 for (i
= 0; i
< 256; i
++)
338 dest
[i
] += src
[i
] + bias
;
341 static void mix3to1 (sample_t
* samples
, sample_t bias
)
345 for (i
= 0; i
< 256; i
++)
346 samples
[i
] += samples
[i
+ 256] + samples
[i
+ 512] + bias
;
349 static void mix4to1 (sample_t
* samples
, sample_t bias
)
353 for (i
= 0; i
< 256; i
++)
354 samples
[i
] += (samples
[i
+ 256] + samples
[i
+ 512] +
355 samples
[i
+ 768] + bias
);
358 static void mix5to1 (sample_t
* samples
, sample_t bias
)
362 for (i
= 0; i
< 256; i
++)
363 samples
[i
] += (samples
[i
+ 256] + samples
[i
+ 512] +
364 samples
[i
+ 768] + samples
[i
+ 1024] + bias
);
367 static void mix3to2 (sample_t
* samples
, sample_t bias
)
372 for (i
= 0; i
< 256; i
++) {
373 common
= samples
[i
+ 256] + bias
;
374 samples
[i
] += common
;
375 samples
[i
+ 256] = samples
[i
+ 512] + common
;
379 static void mix21to2 (sample_t
* left
, sample_t
* right
, sample_t bias
)
384 for (i
= 0; i
< 256; i
++) {
385 common
= right
[i
+ 256] + bias
;
391 static void mix21toS (sample_t
* samples
, sample_t bias
)
396 for (i
= 0; i
< 256; i
++) {
397 surround
= samples
[i
+ 512];
398 samples
[i
] += bias
- surround
;
399 samples
[i
+ 256] += bias
+ surround
;
403 static void mix31to2 (sample_t
* samples
, sample_t bias
)
408 for (i
= 0; i
< 256; i
++) {
409 common
= samples
[i
+ 256] + samples
[i
+ 768] + bias
;
410 samples
[i
] += common
;
411 samples
[i
+ 256] = samples
[i
+ 512] + common
;
415 static void mix31toS (sample_t
* samples
, sample_t bias
)
418 sample_t common
, surround
;
420 for (i
= 0; i
< 256; i
++) {
421 common
= samples
[i
+ 256] + bias
;
422 surround
= samples
[i
+ 768];
423 samples
[i
] += common
- surround
;
424 samples
[i
+ 256] = samples
[i
+ 512] + common
+ surround
;
428 static void mix22toS (sample_t
* samples
, sample_t bias
)
433 for (i
= 0; i
< 256; i
++) {
434 surround
= samples
[i
+ 512] + samples
[i
+ 768];
435 samples
[i
] += bias
- surround
;
436 samples
[i
+ 256] += bias
+ surround
;
440 static void mix32to2 (sample_t
* samples
, sample_t bias
)
445 for (i
= 0; i
< 256; i
++) {
446 common
= samples
[i
+ 256] + bias
;
447 samples
[i
] += common
+ samples
[i
+ 768];
448 samples
[i
+ 256] = common
+ samples
[i
+ 512] + samples
[i
+ 1024];
452 static void mix32toS (sample_t
* samples
, sample_t bias
)
455 sample_t common
, surround
;
457 for (i
= 0; i
< 256; i
++) {
458 common
= samples
[i
+ 256] + bias
;
459 surround
= samples
[i
+ 768] + samples
[i
+ 1024];
460 samples
[i
] += common
- surround
;
461 samples
[i
+ 256] = samples
[i
+ 512] + common
+ surround
;
465 static void move2to1 (sample_t
* src
, sample_t
* dest
, sample_t bias
)
469 for (i
= 0; i
< 256; i
++)
470 dest
[i
] = src
[i
] + src
[i
+ 256] + bias
;
473 static void zero (sample_t
* samples
)
476 for (i
= 0; i
< 256; i
++)
480 static void downmix_C (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
481 sample_t clev
, sample_t slev
)
483 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
485 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
486 memcpy (samples
, samples
+ 256, 256 * sizeof (sample_t
));
489 case CONVERT (A52_CHANNEL
, A52_MONO
):
490 case CONVERT (A52_STEREO
, A52_MONO
):
492 mix2to1 (samples
, samples
+ 256, bias
);
495 case CONVERT (A52_2F1R
, A52_MONO
):
498 case CONVERT (A52_3F
, A52_MONO
):
500 mix3to1 (samples
, bias
);
503 case CONVERT (A52_3F1R
, A52_MONO
):
506 case CONVERT (A52_2F2R
, A52_MONO
):
509 mix4to1 (samples
, bias
);
512 case CONVERT (A52_3F2R
, A52_MONO
):
515 mix5to1 (samples
, bias
);
518 case CONVERT (A52_MONO
, A52_DOLBY
):
519 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
522 case CONVERT (A52_3F
, A52_STEREO
):
523 case CONVERT (A52_3F
, A52_DOLBY
):
525 mix3to2 (samples
, bias
);
528 case CONVERT (A52_2F1R
, A52_STEREO
):
531 mix21to2 (samples
, samples
+ 256, bias
);
534 case CONVERT (A52_2F1R
, A52_DOLBY
):
535 mix21toS (samples
, bias
);
538 case CONVERT (A52_3F1R
, A52_STEREO
):
541 mix31to2 (samples
, bias
);
544 case CONVERT (A52_3F1R
, A52_DOLBY
):
545 mix31toS (samples
, bias
);
548 case CONVERT (A52_2F2R
, A52_STEREO
):
551 mix2to1 (samples
, samples
+ 512, bias
);
552 mix2to1 (samples
+ 256, samples
+ 768, bias
);
555 case CONVERT (A52_2F2R
, A52_DOLBY
):
556 mix22toS (samples
, bias
);
559 case CONVERT (A52_3F2R
, A52_STEREO
):
562 mix32to2 (samples
, bias
);
565 case CONVERT (A52_3F2R
, A52_DOLBY
):
566 mix32toS (samples
, bias
);
569 case CONVERT (A52_3F1R
, A52_3F
):
572 mix21to2 (samples
, samples
+ 512, bias
);
575 case CONVERT (A52_3F2R
, A52_3F
):
578 mix2to1 (samples
, samples
+ 768, bias
);
579 mix2to1 (samples
+ 512, samples
+ 1024, bias
);
582 case CONVERT (A52_3F1R
, A52_2F1R
):
583 mix3to2 (samples
, bias
);
584 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
587 case CONVERT (A52_2F2R
, A52_2F1R
):
588 mix2to1 (samples
+ 512, samples
+ 768, bias
);
591 case CONVERT (A52_3F2R
, A52_2F1R
):
592 mix3to2 (samples
, bias
); //FIXME possible bug? (output doesnt seem to be used)
593 move2to1 (samples
+ 768, samples
+ 512, bias
);
596 case CONVERT (A52_3F2R
, A52_3F1R
):
597 mix2to1 (samples
+ 768, samples
+ 1024, bias
);
600 case CONVERT (A52_2F1R
, A52_2F2R
):
601 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
604 case CONVERT (A52_3F1R
, A52_2F2R
):
605 mix3to2 (samples
, bias
);
606 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
609 case CONVERT (A52_3F2R
, A52_2F2R
):
610 mix3to2 (samples
, bias
);
611 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
612 memcpy (samples
+ 768, samples
+ 1024, 256 * sizeof (sample_t
));
615 case CONVERT (A52_3F1R
, A52_3F2R
):
616 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
621 static void upmix_C (sample_t
* samples
, int acmod
, int output
)
623 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
625 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
626 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
629 case CONVERT (A52_3F2R
, A52_MONO
):
630 zero (samples
+ 1024);
631 case CONVERT (A52_3F1R
, A52_MONO
):
632 case CONVERT (A52_2F2R
, A52_MONO
):
633 zero (samples
+ 768);
634 case CONVERT (A52_3F
, A52_MONO
):
635 case CONVERT (A52_2F1R
, A52_MONO
):
636 zero (samples
+ 512);
637 case CONVERT (A52_CHANNEL
, A52_MONO
):
638 case CONVERT (A52_STEREO
, A52_MONO
):
639 zero (samples
+ 256);
642 case CONVERT (A52_3F2R
, A52_STEREO
):
643 case CONVERT (A52_3F2R
, A52_DOLBY
):
644 zero (samples
+ 1024);
645 case CONVERT (A52_3F1R
, A52_STEREO
):
646 case CONVERT (A52_3F1R
, A52_DOLBY
):
647 zero (samples
+ 768);
648 case CONVERT (A52_3F
, A52_STEREO
):
649 case CONVERT (A52_3F
, A52_DOLBY
):
651 memcpy (samples
+ 512, samples
+ 256, 256 * sizeof (sample_t
));
652 zero (samples
+ 256);
655 case CONVERT (A52_2F2R
, A52_STEREO
):
656 case CONVERT (A52_2F2R
, A52_DOLBY
):
657 zero (samples
+ 768);
658 case CONVERT (A52_2F1R
, A52_STEREO
):
659 case CONVERT (A52_2F1R
, A52_DOLBY
):
660 zero (samples
+ 512);
663 case CONVERT (A52_3F2R
, A52_3F
):
664 zero (samples
+ 1024);
665 case CONVERT (A52_3F1R
, A52_3F
):
666 case CONVERT (A52_2F2R
, A52_2F1R
):
667 zero (samples
+ 768);
670 case CONVERT (A52_3F2R
, A52_3F1R
):
671 zero (samples
+ 1024);
674 case CONVERT (A52_3F2R
, A52_2F1R
):
675 zero (samples
+ 1024);
676 case CONVERT (A52_3F1R
, A52_2F1R
):
678 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
681 case CONVERT (A52_3F2R
, A52_2F2R
):
682 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
688 static void mix2to1_SSE (sample_t
* dest
, sample_t
* src
, sample_t bias
)
691 "movlps %2, %%xmm7 \n\t"
692 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
693 "movl $-1024, %%esi \n\t"
696 "movaps (%0, %%esi), %%xmm0 \n\t"
697 "movaps 16(%0, %%esi), %%xmm1 \n\t"
698 "addps (%1, %%esi), %%xmm0 \n\t"
699 "addps 16(%1, %%esi), %%xmm1 \n\t"
700 "addps %%xmm7, %%xmm0 \n\t"
701 "addps %%xmm7, %%xmm1 \n\t"
702 "movaps %%xmm0, (%1, %%esi) \n\t"
703 "movaps %%xmm1, 16(%1, %%esi) \n\t"
704 "addl $32, %%esi \n\t"
706 :: "r" (src
+256), "r" (dest
+256), "m" (bias
)
711 static void mix3to1_SSE (sample_t
* samples
, sample_t bias
)
714 "movlps %1, %%xmm7 \n\t"
715 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
716 "movl $-1024, %%esi \n\t"
719 "movaps (%0, %%esi), %%xmm0 \n\t"
720 "movaps 1024(%0, %%esi), %%xmm1 \n\t"
721 "addps 2048(%0, %%esi), %%xmm0 \n\t"
722 "addps %%xmm7, %%xmm1 \n\t"
723 "addps %%xmm1, %%xmm0 \n\t"
724 "movaps %%xmm0, (%0, %%esi) \n\t"
725 "addl $16, %%esi \n\t"
727 :: "r" (samples
+256), "m" (bias
)
732 static void mix4to1_SSE (sample_t
* samples
, sample_t bias
)
735 "movlps %1, %%xmm7 \n\t"
736 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
737 "movl $-1024, %%esi \n\t"
740 "movaps (%0, %%esi), %%xmm0 \n\t"
741 "movaps 1024(%0, %%esi), %%xmm1 \n\t"
742 "addps 2048(%0, %%esi), %%xmm0 \n\t"
743 "addps 3072(%0, %%esi), %%xmm1 \n\t"
744 "addps %%xmm7, %%xmm0 \n\t"
745 "addps %%xmm1, %%xmm0 \n\t"
746 "movaps %%xmm0, (%0, %%esi) \n\t"
747 "addl $16, %%esi \n\t"
749 :: "r" (samples
+256), "m" (bias
)
754 static void mix5to1_SSE (sample_t
* samples
, sample_t bias
)
757 "movlps %1, %%xmm7 \n\t"
758 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
759 "movl $-1024, %%esi \n\t"
762 "movaps (%0, %%esi), %%xmm0 \n\t"
763 "movaps 1024(%0, %%esi), %%xmm1 \n\t"
764 "addps 2048(%0, %%esi), %%xmm0 \n\t"
765 "addps 3072(%0, %%esi), %%xmm1 \n\t"
766 "addps %%xmm7, %%xmm0 \n\t"
767 "addps 4096(%0, %%esi), %%xmm1 \n\t"
768 "addps %%xmm1, %%xmm0 \n\t"
769 "movaps %%xmm0, (%0, %%esi) \n\t"
770 "addl $16, %%esi \n\t"
772 :: "r" (samples
+256), "m" (bias
)
777 static void mix3to2_SSE (sample_t
* samples
, sample_t bias
)
780 "movlps %1, %%xmm7 \n\t"
781 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
782 "movl $-1024, %%esi \n\t"
785 "movaps 1024(%0, %%esi), %%xmm0 \n\t"
786 "addps %%xmm7, %%xmm0 \n\t" //common
787 "movaps (%0, %%esi), %%xmm1 \n\t"
788 "movaps 2048(%0, %%esi), %%xmm2 \n\t"
789 "addps %%xmm0, %%xmm1 \n\t"
790 "addps %%xmm0, %%xmm2 \n\t"
791 "movaps %%xmm1, (%0, %%esi) \n\t"
792 "movaps %%xmm2, 1024(%0, %%esi) \n\t"
793 "addl $16, %%esi \n\t"
795 :: "r" (samples
+256), "m" (bias
)
800 static void mix21to2_SSE (sample_t
* left
, sample_t
* right
, sample_t bias
)
803 "movlps %2, %%xmm7 \n\t"
804 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
805 "movl $-1024, %%esi \n\t"
808 "movaps 1024(%1, %%esi), %%xmm0 \n\t"
809 "addps %%xmm7, %%xmm0 \n\t" //common
810 "movaps (%0, %%esi), %%xmm1 \n\t"
811 "movaps (%1, %%esi), %%xmm2 \n\t"
812 "addps %%xmm0, %%xmm1 \n\t"
813 "addps %%xmm0, %%xmm2 \n\t"
814 "movaps %%xmm1, (%0, %%esi) \n\t"
815 "movaps %%xmm2, (%1, %%esi) \n\t"
816 "addl $16, %%esi \n\t"
818 :: "r" (left
+256), "r" (right
+256), "m" (bias
)
823 static void mix21toS_SSE (sample_t
* samples
, sample_t bias
)
826 "movlps %1, %%xmm7 \n\t"
827 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
828 "movl $-1024, %%esi \n\t"
831 "movaps 2048(%0, %%esi), %%xmm0 \n\t" // surround
832 "movaps (%0, %%esi), %%xmm1 \n\t"
833 "movaps 1024(%0, %%esi), %%xmm2 \n\t"
834 "addps %%xmm7, %%xmm1 \n\t"
835 "addps %%xmm7, %%xmm2 \n\t"
836 "subps %%xmm0, %%xmm1 \n\t"
837 "addps %%xmm0, %%xmm2 \n\t"
838 "movaps %%xmm1, (%0, %%esi) \n\t"
839 "movaps %%xmm2, 1024(%0, %%esi) \n\t"
840 "addl $16, %%esi \n\t"
842 :: "r" (samples
+256), "m" (bias
)
847 static void mix31to2_SSE (sample_t
* samples
, sample_t bias
)
850 "movlps %1, %%xmm7 \n\t"
851 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
852 "movl $-1024, %%esi \n\t"
855 "movaps 1024(%0, %%esi), %%xmm0 \n\t"
856 "addps 3072(%0, %%esi), %%xmm0 \n\t"
857 "addps %%xmm7, %%xmm0 \n\t" // common
858 "movaps (%0, %%esi), %%xmm1 \n\t"
859 "movaps 2048(%0, %%esi), %%xmm2 \n\t"
860 "addps %%xmm0, %%xmm1 \n\t"
861 "addps %%xmm0, %%xmm2 \n\t"
862 "movaps %%xmm1, (%0, %%esi) \n\t"
863 "movaps %%xmm2, 1024(%0, %%esi) \n\t"
864 "addl $16, %%esi \n\t"
866 :: "r" (samples
+256), "m" (bias
)
871 static void mix31toS_SSE (sample_t
* samples
, sample_t bias
)
874 "movlps %1, %%xmm7 \n\t"
875 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
876 "movl $-1024, %%esi \n\t"
879 "movaps 1024(%0, %%esi), %%xmm0 \n\t"
880 "movaps 3072(%0, %%esi), %%xmm3 \n\t" // surround
881 "addps %%xmm7, %%xmm0 \n\t" // common
882 "movaps (%0, %%esi), %%xmm1 \n\t"
883 "movaps 2048(%0, %%esi), %%xmm2 \n\t"
884 "addps %%xmm0, %%xmm1 \n\t"
885 "addps %%xmm0, %%xmm2 \n\t"
886 "subps %%xmm3, %%xmm1 \n\t"
887 "addps %%xmm3, %%xmm2 \n\t"
888 "movaps %%xmm1, (%0, %%esi) \n\t"
889 "movaps %%xmm2, 1024(%0, %%esi) \n\t"
890 "addl $16, %%esi \n\t"
892 :: "r" (samples
+256), "m" (bias
)
897 static void mix22toS_SSE (sample_t
* samples
, sample_t bias
)
900 "movlps %1, %%xmm7 \n\t"
901 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
902 "movl $-1024, %%esi \n\t"
905 "movaps 2048(%0, %%esi), %%xmm0 \n\t"
906 "addps 3072(%0, %%esi), %%xmm0 \n\t" // surround
907 "movaps (%0, %%esi), %%xmm1 \n\t"
908 "movaps 1024(%0, %%esi), %%xmm2 \n\t"
909 "addps %%xmm7, %%xmm1 \n\t"
910 "addps %%xmm7, %%xmm2 \n\t"
911 "subps %%xmm0, %%xmm1 \n\t"
912 "addps %%xmm0, %%xmm2 \n\t"
913 "movaps %%xmm1, (%0, %%esi) \n\t"
914 "movaps %%xmm2, 1024(%0, %%esi) \n\t"
915 "addl $16, %%esi \n\t"
917 :: "r" (samples
+256), "m" (bias
)
922 static void mix32to2_SSE (sample_t
* samples
, sample_t bias
)
925 "movlps %1, %%xmm7 \n\t"
926 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
927 "movl $-1024, %%esi \n\t"
930 "movaps 1024(%0, %%esi), %%xmm0 \n\t"
931 "addps %%xmm7, %%xmm0 \n\t" // common
932 "movaps %%xmm0, %%xmm1 \n\t" // common
933 "addps (%0, %%esi), %%xmm0 \n\t"
934 "addps 2048(%0, %%esi), %%xmm1 \n\t"
935 "addps 3072(%0, %%esi), %%xmm0 \n\t"
936 "addps 4096(%0, %%esi), %%xmm1 \n\t"
937 "movaps %%xmm0, (%0, %%esi) \n\t"
938 "movaps %%xmm1, 1024(%0, %%esi) \n\t"
939 "addl $16, %%esi \n\t"
941 :: "r" (samples
+256), "m" (bias
)
946 static void mix32toS_SSE (sample_t
* samples
, sample_t bias
)
949 "movlps %1, %%xmm7 \n\t"
950 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
951 "movl $-1024, %%esi \n\t"
954 "movaps 1024(%0, %%esi), %%xmm0 \n\t"
955 "movaps 3072(%0, %%esi), %%xmm2 \n\t"
956 "addps %%xmm7, %%xmm0 \n\t" // common
957 "addps 4096(%0, %%esi), %%xmm2 \n\t" // surround
958 "movaps (%0, %%esi), %%xmm1 \n\t"
959 "movaps 2048(%0, %%esi), %%xmm3 \n\t"
960 "subps %%xmm2, %%xmm1 \n\t"
961 "addps %%xmm2, %%xmm3 \n\t"
962 "addps %%xmm0, %%xmm1 \n\t"
963 "addps %%xmm0, %%xmm3 \n\t"
964 "movaps %%xmm1, (%0, %%esi) \n\t"
965 "movaps %%xmm3, 1024(%0, %%esi) \n\t"
966 "addl $16, %%esi \n\t"
968 :: "r" (samples
+256), "m" (bias
)
973 static void move2to1_SSE (sample_t
* src
, sample_t
* dest
, sample_t bias
)
976 "movlps %2, %%xmm7 \n\t"
977 "shufps $0x00, %%xmm7, %%xmm7 \n\t"
978 "movl $-1024, %%esi \n\t"
981 "movaps (%0, %%esi), %%xmm0 \n\t"
982 "movaps 16(%0, %%esi), %%xmm1 \n\t"
983 "addps 1024(%0, %%esi), %%xmm0 \n\t"
984 "addps 1040(%0, %%esi), %%xmm1 \n\t"
985 "addps %%xmm7, %%xmm0 \n\t"
986 "addps %%xmm7, %%xmm1 \n\t"
987 "movaps %%xmm0, (%1, %%esi) \n\t"
988 "movaps %%xmm1, 16(%1, %%esi) \n\t"
989 "addl $32, %%esi \n\t"
991 :: "r" (src
+256), "r" (dest
+256), "m" (bias
)
996 static void zero_MMX(sample_t
* samples
)
999 "movl $-1024, %%esi \n\t"
1000 "pxor %%mm0, %%mm0 \n\t"
1003 "movq %%mm0, (%0, %%esi) \n\t"
1004 "movq %%mm0, 8(%0, %%esi) \n\t"
1005 "movq %%mm0, 16(%0, %%esi) \n\t"
1006 "movq %%mm0, 24(%0, %%esi) \n\t"
1007 "addl $32, %%esi \n\t"
1010 :: "r" (samples
+256)
/*
 * Copy `size` bytes from src to dest, 64 bytes at a time through the eight
 * MMX registers.  movq has no alignment requirement, but 8-byte aligned
 * buffers are what the original author hoped for.
 *
 * Fixes relative to the previous version:
 *  - the operands were "m"(src) / "m"(dest), i.e. the memory holding the
 *    pointer variables themselves, so the movq instructions read and
 *    overwrote those variables' storage instead of the buffers;
 *  - neither pointer was advanced, so every iteration touched the same
 *    64 bytes;
 *  - a size that is not a multiple of 64 silently lost its tail; the
 *    remainder is now copied with memcpy().
 * The pointers are passed in registers, dereferenced, and stepped by 64
 * bytes per chunk.  emms is issued once at the end so x87 code can run
 * afterwards.  On non-x86 targets the chunks fall back to memcpy().
 *
 * Note: the original comment marks this helper as untested and unused.
 */
static void copy_MMX(void *dest,const void *src,unsigned size)
{
    unsigned char *d = dest;
    const unsigned char *s = src;
    unsigned chunks = size / 64;
    unsigned i;

    for (i = 0; i < chunks; i++) {
#if defined(__i386__) || defined(__x86_64__)
        __asm__ __volatile__(
            "movq   (%0), %%mm0\n\t"
            "movq  8(%0), %%mm1\n\t"
            "movq 16(%0), %%mm2\n\t"
            "movq 24(%0), %%mm3\n\t"
            "movq 32(%0), %%mm4\n\t"
            "movq 40(%0), %%mm5\n\t"
            "movq 48(%0), %%mm6\n\t"
            "movq 56(%0), %%mm7\n\t"
            "movq %%mm0,   (%1)\n\t"
            "movq %%mm1,  8(%1)\n\t"
            "movq %%mm2, 16(%1)\n\t"
            "movq %%mm3, 24(%1)\n\t"
            "movq %%mm4, 32(%1)\n\t"
            "movq %%mm5, 40(%1)\n\t"
            "movq %%mm6, 48(%1)\n\t"
            "movq %%mm7, 56(%1)\n\t"
            :
            : "r"(s), "r"(d)
            : "memory");
#else
        memcpy (d, s, 64);
#endif
        s += 64;
        d += 64;
    }
#if defined(__i386__) || defined(__x86_64__)
    __asm__ __volatile__("emms" ::: "memory");
#endif

    /* remaining 0..63 bytes */
    if (size % 64)
        memcpy (d, s, size % 64);
}
1048 static void downmix_SSE (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
1049 sample_t clev
, sample_t slev
)
1051 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
1053 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
1054 memcpy (samples
, samples
+ 256, 256 * sizeof (sample_t
));
1057 case CONVERT (A52_CHANNEL
, A52_MONO
):
1058 case CONVERT (A52_STEREO
, A52_MONO
):
1060 mix2to1_SSE (samples
, samples
+ 256, bias
);
1063 case CONVERT (A52_2F1R
, A52_MONO
):
1066 case CONVERT (A52_3F
, A52_MONO
):
1068 mix3to1_SSE (samples
, bias
);
1071 case CONVERT (A52_3F1R
, A52_MONO
):
1074 case CONVERT (A52_2F2R
, A52_MONO
):
1077 mix4to1_SSE (samples
, bias
);
1080 case CONVERT (A52_3F2R
, A52_MONO
):
1083 mix5to1_SSE (samples
, bias
);
1086 case CONVERT (A52_MONO
, A52_DOLBY
):
1087 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
1090 case CONVERT (A52_3F
, A52_STEREO
):
1091 case CONVERT (A52_3F
, A52_DOLBY
):
1093 mix3to2_SSE (samples
, bias
);
1096 case CONVERT (A52_2F1R
, A52_STEREO
):
1099 mix21to2_SSE (samples
, samples
+ 256, bias
);
1102 case CONVERT (A52_2F1R
, A52_DOLBY
):
1103 mix21toS_SSE (samples
, bias
);
1106 case CONVERT (A52_3F1R
, A52_STEREO
):
1109 mix31to2_SSE (samples
, bias
);
1112 case CONVERT (A52_3F1R
, A52_DOLBY
):
1113 mix31toS_SSE (samples
, bias
);
1116 case CONVERT (A52_2F2R
, A52_STEREO
):
1119 mix2to1_SSE (samples
, samples
+ 512, bias
);
1120 mix2to1_SSE (samples
+ 256, samples
+ 768, bias
);
1123 case CONVERT (A52_2F2R
, A52_DOLBY
):
1124 mix22toS_SSE (samples
, bias
);
1127 case CONVERT (A52_3F2R
, A52_STEREO
):
1130 mix32to2_SSE (samples
, bias
);
1133 case CONVERT (A52_3F2R
, A52_DOLBY
):
1134 mix32toS_SSE (samples
, bias
);
1137 case CONVERT (A52_3F1R
, A52_3F
):
1140 mix21to2_SSE (samples
, samples
+ 512, bias
);
1143 case CONVERT (A52_3F2R
, A52_3F
):
1146 mix2to1_SSE (samples
, samples
+ 768, bias
);
1147 mix2to1_SSE (samples
+ 512, samples
+ 1024, bias
);
1150 case CONVERT (A52_3F1R
, A52_2F1R
):
1151 mix3to2_SSE (samples
, bias
);
1152 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1155 case CONVERT (A52_2F2R
, A52_2F1R
):
1156 mix2to1_SSE (samples
+ 512, samples
+ 768, bias
);
1159 case CONVERT (A52_3F2R
, A52_2F1R
):
1160 mix3to2_SSE (samples
, bias
); //FIXME possible bug? (output doesnt seem to be used)
1161 move2to1_SSE (samples
+ 768, samples
+ 512, bias
);
1164 case CONVERT (A52_3F2R
, A52_3F1R
):
1165 mix2to1_SSE (samples
+ 768, samples
+ 1024, bias
);
1168 case CONVERT (A52_2F1R
, A52_2F2R
):
1169 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
1172 case CONVERT (A52_3F1R
, A52_2F2R
):
1173 mix3to2_SSE (samples
, bias
);
1174 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1177 case CONVERT (A52_3F2R
, A52_2F2R
):
1178 mix3to2_SSE (samples
, bias
);
1179 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1180 memcpy (samples
+ 768, samples
+ 1024, 256 * sizeof (sample_t
));
1183 case CONVERT (A52_3F1R
, A52_3F2R
):
1184 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
1189 static void upmix_MMX (sample_t
* samples
, int acmod
, int output
)
1191 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
1193 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
1194 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
1197 case CONVERT (A52_3F2R
, A52_MONO
):
1198 zero_MMX (samples
+ 1024);
1199 case CONVERT (A52_3F1R
, A52_MONO
):
1200 case CONVERT (A52_2F2R
, A52_MONO
):
1201 zero_MMX (samples
+ 768);
1202 case CONVERT (A52_3F
, A52_MONO
):
1203 case CONVERT (A52_2F1R
, A52_MONO
):
1204 zero_MMX (samples
+ 512);
1205 case CONVERT (A52_CHANNEL
, A52_MONO
):
1206 case CONVERT (A52_STEREO
, A52_MONO
):
1207 zero_MMX (samples
+ 256);
1210 case CONVERT (A52_3F2R
, A52_STEREO
):
1211 case CONVERT (A52_3F2R
, A52_DOLBY
):
1212 zero_MMX (samples
+ 1024);
1213 case CONVERT (A52_3F1R
, A52_STEREO
):
1214 case CONVERT (A52_3F1R
, A52_DOLBY
):
1215 zero_MMX (samples
+ 768);
1216 case CONVERT (A52_3F
, A52_STEREO
):
1217 case CONVERT (A52_3F
, A52_DOLBY
):
1219 memcpy (samples
+ 512, samples
+ 256, 256 * sizeof (sample_t
));
1220 zero_MMX (samples
+ 256);
1223 case CONVERT (A52_2F2R
, A52_STEREO
):
1224 case CONVERT (A52_2F2R
, A52_DOLBY
):
1225 zero_MMX (samples
+ 768);
1226 case CONVERT (A52_2F1R
, A52_STEREO
):
1227 case CONVERT (A52_2F1R
, A52_DOLBY
):
1228 zero_MMX (samples
+ 512);
1231 case CONVERT (A52_3F2R
, A52_3F
):
1232 zero_MMX (samples
+ 1024);
1233 case CONVERT (A52_3F1R
, A52_3F
):
1234 case CONVERT (A52_2F2R
, A52_2F1R
):
1235 zero_MMX (samples
+ 768);
1238 case CONVERT (A52_3F2R
, A52_3F1R
):
1239 zero_MMX (samples
+ 1024);
1242 case CONVERT (A52_3F2R
, A52_2F1R
):
1243 zero_MMX (samples
+ 1024);
1244 case CONVERT (A52_3F1R
, A52_2F1R
):
1246 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
1249 case CONVERT (A52_3F2R
, A52_2F2R
):
1250 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
1251 goto mix_31to21_MMX
;
1255 static void mix2to1_3dnow (sample_t
* dest
, sample_t
* src
, sample_t bias
)
1258 "movd %2, %%mm7 \n\t"
1259 "punpckldq %2, %%mm7 \n\t"
1260 "movl $-1024, %%esi \n\t"
1263 "movq (%0, %%esi), %%mm0 \n\t"
1264 "movq 8(%0, %%esi), %%mm1 \n\t"
1265 "movq 16(%0, %%esi), %%mm2 \n\t"
1266 "movq 24(%0, %%esi), %%mm3 \n\t"
1267 "pfadd (%1, %%esi), %%mm0 \n\t"
1268 "pfadd 8(%1, %%esi), %%mm1 \n\t"
1269 "pfadd 16(%1, %%esi), %%mm2 \n\t"
1270 "pfadd 24(%1, %%esi), %%mm3 \n\t"
1271 "pfadd %%mm7, %%mm0 \n\t"
1272 "pfadd %%mm7, %%mm1 \n\t"
1273 "pfadd %%mm7, %%mm2 \n\t"
1274 "pfadd %%mm7, %%mm3 \n\t"
1275 "movq %%mm0, (%1, %%esi) \n\t"
1276 "movq %%mm1, 8(%1, %%esi) \n\t"
1277 "movq %%mm2, 16(%1, %%esi) \n\t"
1278 "movq %%mm3, 24(%1, %%esi) \n\t"
1279 "addl $32, %%esi \n\t"
1281 :: "r" (src
+256), "r" (dest
+256), "m" (bias
)
1286 static void mix3to1_3dnow (sample_t
* samples
, sample_t bias
)
1289 "movd %1, %%mm7 \n\t"
1290 "punpckldq %1, %%mm7 \n\t"
1291 "movl $-1024, %%esi \n\t"
1294 "movq (%0, %%esi), %%mm0 \n\t"
1295 "movq 8(%0, %%esi), %%mm1 \n\t"
1296 "movq 1024(%0, %%esi), %%mm2 \n\t"
1297 "movq 1032(%0, %%esi), %%mm3 \n\t"
1298 "pfadd 2048(%0, %%esi), %%mm0 \n\t"
1299 "pfadd 2056(%0, %%esi), %%mm1 \n\t"
1300 "pfadd %%mm7, %%mm0 \n\t"
1301 "pfadd %%mm7, %%mm1 \n\t"
1302 "pfadd %%mm2, %%mm0 \n\t"
1303 "pfadd %%mm3, %%mm1 \n\t"
1304 "movq %%mm0, (%0, %%esi) \n\t"
1305 "movq %%mm1, 8(%0, %%esi) \n\t"
1306 "addl $16, %%esi \n\t"
1308 :: "r" (samples
+256), "m" (bias
)
/*
 * mix4to1_3dnow: 3DNow! kernel that sums the four blocks found at byte
 * offsets 0, 1024, 2048 and 3072 of samples, adds bias, and stores the
 * result over the block at offset 0.
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * esi is a byte offset that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1313 static void mix4to1_3dnow (sample_t
* samples
, sample_t bias
)
/* mm7 = { bias, bias } */
1316 "movd %1, %%mm7 \n\t"
1317 "punpckldq %1, %%mm7 \n\t"
1318 "movl $-1024, %%esi \n\t"
1321 "movq (%0, %%esi), %%mm0 \n\t"
1322 "movq 8(%0, %%esi), %%mm1 \n\t"
1323 "movq 1024(%0, %%esi), %%mm2 \n\t"
1324 "movq 1032(%0, %%esi), %%mm3 \n\t"
1325 "pfadd 2048(%0, %%esi), %%mm0 \n\t"
1326 "pfadd 2056(%0, %%esi), %%mm1 \n\t"
1327 "pfadd 3072(%0, %%esi), %%mm2 \n\t"
1328 "pfadd 3080(%0, %%esi), %%mm3 \n\t"
1329 "pfadd %%mm7, %%mm0 \n\t"
1330 "pfadd %%mm7, %%mm1 \n\t"
/* combine the two partial sums */
1331 "pfadd %%mm2, %%mm0 \n\t"
1332 "pfadd %%mm3, %%mm1 \n\t"
1333 "movq %%mm0, (%0, %%esi) \n\t"
1334 "movq %%mm1, 8(%0, %%esi) \n\t"
1335 "addl $16, %%esi \n\t"
1337 :: "r" (samples
+256), "m" (bias
)
/*
 * mix5to1_3dnow: 3DNow! kernel that sums the five blocks found at byte
 * offsets 0, 1024, 2048, 3072 and 4096 of samples, adds bias, and stores
 * the result over the block at offset 0.
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * esi is a byte offset that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1342 static void mix5to1_3dnow (sample_t
* samples
, sample_t bias
)
/* mm7 = { bias, bias } */
1345 "movd %1, %%mm7 \n\t"
1346 "punpckldq %1, %%mm7 \n\t"
1347 "movl $-1024, %%esi \n\t"
1350 "movq (%0, %%esi), %%mm0 \n\t"
1351 "movq 8(%0, %%esi), %%mm1 \n\t"
1352 "movq 1024(%0, %%esi), %%mm2 \n\t"
1353 "movq 1032(%0, %%esi), %%mm3 \n\t"
1354 "pfadd 2048(%0, %%esi), %%mm0 \n\t"
1355 "pfadd 2056(%0, %%esi), %%mm1 \n\t"
1356 "pfadd 3072(%0, %%esi), %%mm2 \n\t"
1357 "pfadd 3080(%0, %%esi), %%mm3 \n\t"
1358 "pfadd %%mm7, %%mm0 \n\t"
1359 "pfadd %%mm7, %%mm1 \n\t"
1360 "pfadd 4096(%0, %%esi), %%mm2 \n\t"
1361 "pfadd 4104(%0, %%esi), %%mm3 \n\t"
/* combine the two partial sums */
1362 "pfadd %%mm2, %%mm0 \n\t"
1363 "pfadd %%mm3, %%mm1 \n\t"
1364 "movq %%mm0, (%0, %%esi) \n\t"
1365 "movq %%mm1, 8(%0, %%esi) \n\t"
1366 "addl $16, %%esi \n\t"
1368 :: "r" (samples
+256), "m" (bias
)
/*
 * mix3to2_3dnow: 3DNow! kernel producing two output blocks from three
 * input blocks of samples.
 *
 *   common            = block@1024 + bias
 *   block@0    (out)  = block@0    + common
 *   block@1024 (out)  = block@2048 + common
 *
 * (block@N is the data at byte offset N from samples.)
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * esi is a byte offset that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1373 static void mix3to2_3dnow (sample_t
* samples
, sample_t bias
)
/* mm7 = { bias, bias } */
1376 "movd %1, %%mm7 \n\t"
1377 "punpckldq %1, %%mm7 \n\t"
1378 "movl $-1024, %%esi \n\t"
1381 "movq 1024(%0, %%esi), %%mm0 \n\t"
1382 "movq 1032(%0, %%esi), %%mm1 \n\t"
1383 "pfadd %%mm7, %%mm0 \n\t" //common
1384 "pfadd %%mm7, %%mm1 \n\t" //common
1385 "movq (%0, %%esi), %%mm2 \n\t"
1386 "movq 8(%0, %%esi), %%mm3 \n\t"
1387 "movq 2048(%0, %%esi), %%mm4 \n\t"
1388 "movq 2056(%0, %%esi), %%mm5 \n\t"
1389 "pfadd %%mm0, %%mm2 \n\t"
1390 "pfadd %%mm1, %%mm3 \n\t"
1391 "pfadd %%mm0, %%mm4 \n\t"
1392 "pfadd %%mm1, %%mm5 \n\t"
/* outputs land in the blocks at offsets 0 and 1024 */
1393 "movq %%mm2, (%0, %%esi) \n\t"
1394 "movq %%mm3, 8(%0, %%esi) \n\t"
1395 "movq %%mm4, 1024(%0, %%esi) \n\t"
1396 "movq %%mm5, 1032(%0, %%esi) \n\t"
1397 "addl $16, %%esi \n\t"
1399 :: "r" (samples
+256), "m" (bias
)
/*
 * mix21to2_3dnow: 3DNow! kernel that adds a shared block plus bias to two
 * separate buffers in place.
 *
 *   common = (data 1024 bytes past right) + bias
 *   left   = left  + common
 *   right  = right + common
 *
 * left  - first buffer, read and overwritten (asm operand %0)
 * right - second buffer, read and overwritten; the shared block is read
 *         1024 bytes after it (asm operand %1)
 * bias  - scalar placed in both 32-bit lanes of mm7
 *
 * Both pointers are passed advanced by 256 samples; esi is a byte offset
 * that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. 1024 bytes is
 * 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1404 static void mix21to2_3dnow (sample_t
* left
, sample_t
* right
, sample_t bias
)
/* mm7 = { bias, bias } */
1407 "movd %2, %%mm7 \n\t"
1408 "punpckldq %2, %%mm7 \n\t"
1409 "movl $-1024, %%esi \n\t"
1412 "movq 1024(%1, %%esi), %%mm0 \n\t"
1413 "movq 1032(%1, %%esi), %%mm1 \n\t"
1414 "pfadd %%mm7, %%mm0 \n\t" //common
1415 "pfadd %%mm7, %%mm1 \n\t" //common
1416 "movq (%0, %%esi), %%mm2 \n\t"
1417 "movq 8(%0, %%esi), %%mm3 \n\t"
1418 "movq (%1, %%esi), %%mm4 \n\t"
1419 "movq 8(%1, %%esi), %%mm5 \n\t"
1420 "pfadd %%mm0, %%mm2 \n\t"
1421 "pfadd %%mm1, %%mm3 \n\t"
1422 "pfadd %%mm0, %%mm4 \n\t"
1423 "pfadd %%mm1, %%mm5 \n\t"
1424 "movq %%mm2, (%0, %%esi) \n\t"
1425 "movq %%mm3, 8(%0, %%esi) \n\t"
1426 "movq %%mm4, (%1, %%esi) \n\t"
1427 "movq %%mm5, 8(%1, %%esi) \n\t"
1428 "addl $16, %%esi \n\t"
/* operands: %0 = left + 256, %1 = right + 256, %2 = bias (in memory) */
1430 :: "r" (left
+256), "r" (right
+256), "m" (bias
)
/*
 * mix21toS_3dnow: 3DNow! kernel that subtracts one block from the first
 * output and adds it to the second.
 *
 *   surround          = block@2048
 *   block@0    (out)  = block@0    + bias - surround
 *   block@1024 (out)  = block@1024 + bias + surround
 *
 * (block@N is the data at byte offset N from samples.)
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * esi is a byte offset that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1435 static void mix21toS_3dnow (sample_t
* samples
, sample_t bias
)
/* mm7 = { bias, bias } */
1438 "movd %1, %%mm7 \n\t"
1439 "punpckldq %1, %%mm7 \n\t"
1440 "movl $-1024, %%esi \n\t"
1443 "movq 2048(%0, %%esi), %%mm0 \n\t" // surround
1444 "movq 2056(%0, %%esi), %%mm1 \n\t" // surround
1445 "movq (%0, %%esi), %%mm2 \n\t"
1446 "movq 8(%0, %%esi), %%mm3 \n\t"
1447 "movq 1024(%0, %%esi), %%mm4 \n\t"
1448 "movq 1032(%0, %%esi), %%mm5 \n\t"
1449 "pfadd %%mm7, %%mm2 \n\t"
1450 "pfadd %%mm7, %%mm3 \n\t"
1451 "pfadd %%mm7, %%mm4 \n\t"
1452 "pfadd %%mm7, %%mm5 \n\t"
/* surround is subtracted from the first output and added to the second */
1453 "pfsub %%mm0, %%mm2 \n\t"
1454 "pfsub %%mm1, %%mm3 \n\t"
1455 "pfadd %%mm0, %%mm4 \n\t"
1456 "pfadd %%mm1, %%mm5 \n\t"
1457 "movq %%mm2, (%0, %%esi) \n\t"
1458 "movq %%mm3, 8(%0, %%esi) \n\t"
1459 "movq %%mm4, 1024(%0, %%esi) \n\t"
1460 "movq %%mm5, 1032(%0, %%esi) \n\t"
1461 "addl $16, %%esi \n\t"
1463 :: "r" (samples
+256), "m" (bias
)
/*
 * mix31to2_3dnow: 3DNow! kernel producing two output blocks from four
 * input blocks of samples.
 *
 *   common            = block@1024 + block@3072 + bias
 *   block@0    (out)  = block@0    + common
 *   block@1024 (out)  = block@2048 + common
 *
 * (block@N is the data at byte offset N from samples.)
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * esi is a byte offset that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1468 static void mix31to2_3dnow (sample_t
* samples
, sample_t bias
)
/* mm7 = { bias, bias } */
1471 "movd %1, %%mm7 \n\t"
1472 "punpckldq %1, %%mm7 \n\t"
1473 "movl $-1024, %%esi \n\t"
1476 "movq 1024(%0, %%esi), %%mm0 \n\t"
1477 "movq 1032(%0, %%esi), %%mm1 \n\t"
1478 "pfadd 3072(%0, %%esi), %%mm0 \n\t"
1479 "pfadd 3080(%0, %%esi), %%mm1 \n\t"
1480 "pfadd %%mm7, %%mm0 \n\t" // common
1481 "pfadd %%mm7, %%mm1 \n\t" // common
1482 "movq (%0, %%esi), %%mm2 \n\t"
1483 "movq 8(%0, %%esi), %%mm3 \n\t"
1484 "movq 2048(%0, %%esi), %%mm4 \n\t"
1485 "movq 2056(%0, %%esi), %%mm5 \n\t"
1486 "pfadd %%mm0, %%mm2 \n\t"
1487 "pfadd %%mm1, %%mm3 \n\t"
1488 "pfadd %%mm0, %%mm4 \n\t"
1489 "pfadd %%mm1, %%mm5 \n\t"
/* outputs land in the blocks at offsets 0 and 1024 */
1490 "movq %%mm2, (%0, %%esi) \n\t"
1491 "movq %%mm3, 8(%0, %%esi) \n\t"
1492 "movq %%mm4, 1024(%0, %%esi) \n\t"
1493 "movq %%mm5, 1032(%0, %%esi) \n\t"
1494 "addl $16, %%esi \n\t"
1496 :: "r" (samples
+256), "m" (bias
)
/*
 * mix31toS_3dnow: 3DNow! kernel producing two output blocks from four
 * input blocks, with one block subtracted from the first output and added
 * to the second.
 *
 *   common            = block@1024 + bias
 *   surround          = block@3072
 *   block@0    (out)  = block@0    + common - surround
 *   block@1024 (out)  = block@2048 + common + surround
 *
 * (block@N is the data at byte offset N from samples.)
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * esi is a byte offset that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1501 static void mix31toS_3dnow (sample_t
* samples
, sample_t bias
)
/* mm7 = { bias, bias } */
1504 "movd %1, %%mm7 \n\t"
1505 "punpckldq %1, %%mm7 \n\t"
1506 "movl $-1024, %%esi \n\t"
1509 "movq 1024(%0, %%esi), %%mm0 \n\t"
1510 "movq 1032(%0, %%esi), %%mm1 \n\t"
1511 "pfadd %%mm7, %%mm0 \n\t" // common
1512 "pfadd %%mm7, %%mm1 \n\t" // common
1513 "movq (%0, %%esi), %%mm2 \n\t"
1514 "movq 8(%0, %%esi), %%mm3 \n\t"
1515 "movq 2048(%0, %%esi), %%mm4 \n\t"
1516 "movq 2056(%0, %%esi), %%mm5 \n\t"
1517 "pfadd %%mm0, %%mm2 \n\t"
1518 "pfadd %%mm1, %%mm3 \n\t"
1519 "pfadd %%mm0, %%mm4 \n\t"
1520 "pfadd %%mm1, %%mm5 \n\t"
/* mm0/mm1 are reused for the surround block once common has been applied */
1521 "movq 3072(%0, %%esi), %%mm0 \n\t" // surround
1522 "movq 3080(%0, %%esi), %%mm1 \n\t" // surround
1523 "pfsub %%mm0, %%mm2 \n\t"
1524 "pfsub %%mm1, %%mm3 \n\t"
1525 "pfadd %%mm0, %%mm4 \n\t"
1526 "pfadd %%mm1, %%mm5 \n\t"
1527 "movq %%mm2, (%0, %%esi) \n\t"
1528 "movq %%mm3, 8(%0, %%esi) \n\t"
1529 "movq %%mm4, 1024(%0, %%esi) \n\t"
1530 "movq %%mm5, 1032(%0, %%esi) \n\t"
1531 "addl $16, %%esi \n\t"
1533 :: "r" (samples
+256), "m" (bias
)
/*
 * mix22toS_3dnow: 3DNow! kernel that folds the sum of two blocks into two
 * output blocks with opposite signs.
 *
 *   surround          = block@2048 + block@3072
 *   block@0    (out)  = block@0    + bias - surround
 *   block@1024 (out)  = block@1024 + bias + surround
 *
 * (block@N is the data at byte offset N from samples.)
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * esi is a byte offset that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1538 static void mix22toS_3dnow (sample_t
* samples
, sample_t bias
)
/* mm7 = { bias, bias } */
1541 "movd %1, %%mm7 \n\t"
1542 "punpckldq %1, %%mm7 \n\t"
1543 "movl $-1024, %%esi \n\t"
1546 "movq 2048(%0, %%esi), %%mm0 \n\t"
1547 "movq 2056(%0, %%esi), %%mm1 \n\t"
1548 "pfadd 3072(%0, %%esi), %%mm0 \n\t" // surround
1549 "pfadd 3080(%0, %%esi), %%mm1 \n\t" // surround
1550 "movq (%0, %%esi), %%mm2 \n\t"
1551 "movq 8(%0, %%esi), %%mm3 \n\t"
1552 "movq 1024(%0, %%esi), %%mm4 \n\t"
1553 "movq 1032(%0, %%esi), %%mm5 \n\t"
1554 "pfadd %%mm7, %%mm2 \n\t"
1555 "pfadd %%mm7, %%mm3 \n\t"
1556 "pfadd %%mm7, %%mm4 \n\t"
1557 "pfadd %%mm7, %%mm5 \n\t"
/* surround is subtracted from the first output and added to the second */
1558 "pfsub %%mm0, %%mm2 \n\t"
1559 "pfsub %%mm1, %%mm3 \n\t"
1560 "pfadd %%mm0, %%mm4 \n\t"
1561 "pfadd %%mm1, %%mm5 \n\t"
1562 "movq %%mm2, (%0, %%esi) \n\t"
1563 "movq %%mm3, 8(%0, %%esi) \n\t"
1564 "movq %%mm4, 1024(%0, %%esi) \n\t"
1565 "movq %%mm5, 1032(%0, %%esi) \n\t"
1566 "addl $16, %%esi \n\t"
1568 :: "r" (samples
+256), "m" (bias
)
/*
 * mix32to2_3dnow: 3DNow! kernel producing two output blocks from five
 * input blocks of samples.
 *
 *   common            = block@1024 + bias
 *   block@0    (out)  = common + block@0    + block@3072
 *   block@1024 (out)  = common + block@2048 + block@4096
 *
 * (block@N is the data at byte offset N from samples.)
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * esi is a byte offset that starts at -1024 and is raised by 16 per pass.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1573 static void mix32to2_3dnow (sample_t
* samples
, sample_t bias
)
/* mm7 = { bias, bias } */
1576 "movd %1, %%mm7 \n\t"
1577 "punpckldq %1, %%mm7 \n\t"
1578 "movl $-1024, %%esi \n\t"
1581 "movq 1024(%0, %%esi), %%mm0 \n\t"
1582 "movq 1032(%0, %%esi), %%mm1 \n\t"
1583 "pfadd %%mm7, %%mm0 \n\t" // common
1584 "pfadd %%mm7, %%mm1 \n\t" // common
/* duplicate common so each output accumulates in its own register pair */
1585 "movq %%mm0, %%mm2 \n\t" // common
1586 "movq %%mm1, %%mm3 \n\t" // common
1587 "pfadd (%0, %%esi), %%mm0 \n\t"
1588 "pfadd 8(%0, %%esi), %%mm1 \n\t"
1589 "pfadd 2048(%0, %%esi), %%mm2 \n\t"
1590 "pfadd 2056(%0, %%esi), %%mm3 \n\t"
1591 "pfadd 3072(%0, %%esi), %%mm0 \n\t"
1592 "pfadd 3080(%0, %%esi), %%mm1 \n\t"
1593 "pfadd 4096(%0, %%esi), %%mm2 \n\t"
1594 "pfadd 4104(%0, %%esi), %%mm3 \n\t"
1595 "movq %%mm0, (%0, %%esi) \n\t"
1596 "movq %%mm1, 8(%0, %%esi) \n\t"
1597 "movq %%mm2, 1024(%0, %%esi) \n\t"
1598 "movq %%mm3, 1032(%0, %%esi) \n\t"
1599 "addl $16, %%esi \n\t"
1601 :: "r" (samples
+256), "m" (bias
)
1606 /* todo: should be optimized better */
/*
 * mix32toS_3dnow: 3DNow! kernel producing two output blocks from five
 * input blocks, with the sum of two blocks subtracted from the first
 * output and added to the second.
 *
 *   common            = block@1024 + bias
 *   surround          = block@3072 + block@4096
 *   block@0    (out)  = block@0    - surround + common
 *   block@1024 (out)  = block@2048 + surround + common
 *
 * (block@N is the data at byte offset N from samples.)
 *
 * samples - buffer holding the blocks (asm operand %0, passed as samples + 256)
 * bias    - scalar placed in both 32-bit lanes of mm7
 *
 * Unlike the sibling kernels, the bias load into mm7 comes after the esi
 * initialisation, and mm7 is later overwritten with sample data.
 * NOTE(review): presumably the bias load sits inside the loop so mm7 is
 * refreshed on every pass; the loop label is not visible here - confirm.
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. each 1024-byte
 * block is 256 samples - confirm.
 * NOTE(review): the backward branch is not visible in this listing;
 * presumably the pass repeats until esi reaches 0 - confirm.
 */
1607 static void mix32toS_3dnow (sample_t
* samples
, sample_t bias
)
1610 "movl $-1024, %%esi \n\t"
/* mm7 = { bias, bias } */
1613 "movd %1, %%mm7 \n\t"
1614 "punpckldq %1, %%mm7 \n\t"
1615 "movq 1024(%0, %%esi), %%mm0 \n\t"
1616 "movq 1032(%0, %%esi), %%mm1 \n\t"
1617 "movq 3072(%0, %%esi), %%mm4 \n\t"
1618 "movq 3080(%0, %%esi), %%mm5 \n\t"
1619 "pfadd %%mm7, %%mm0 \n\t" // common
1620 "pfadd %%mm7, %%mm1 \n\t" // common
1621 "pfadd 4096(%0, %%esi), %%mm4 \n\t" // surround
1622 "pfadd 4104(%0, %%esi), %%mm5 \n\t" // surround
1623 "movq (%0, %%esi), %%mm2 \n\t"
1624 "movq 8(%0, %%esi), %%mm3 \n\t"
1625 "movq 2048(%0, %%esi), %%mm6 \n\t"
/* mm7 stops holding the bias from here on; it now carries sample data */
1626 "movq 2056(%0, %%esi), %%mm7 \n\t"
1627 "pfsub %%mm4, %%mm2 \n\t"
1628 "pfsub %%mm5, %%mm3 \n\t"
1629 "pfadd %%mm4, %%mm6 \n\t"
1630 "pfadd %%mm5, %%mm7 \n\t"
1631 "pfadd %%mm0, %%mm2 \n\t"
1632 "pfadd %%mm1, %%mm3 \n\t"
1633 "pfadd %%mm0, %%mm6 \n\t"
1634 "pfadd %%mm1, %%mm7 \n\t"
1635 "movq %%mm2, (%0, %%esi) \n\t"
1636 "movq %%mm3, 8(%0, %%esi) \n\t"
1637 "movq %%mm6, 1024(%0, %%esi) \n\t"
1638 "movq %%mm7, 1032(%0, %%esi) \n\t"
1639 "addl $16, %%esi \n\t"
1641 :: "r" (samples
+256), "m" (bias
)
/*
 * move2to1_3dnow: 3DNow! kernel that sums two adjacent blocks of src, adds
 * bias, and writes the result to dest.
 *
 *   dest = src + (data 1024 bytes past src) + bias
 *
 * src  - input buffer, only read (asm operand %0)
 * dest - output buffer, only written (asm operand %1)
 * bias - scalar placed in both 32-bit lanes of mm7
 *
 * Both pointers are passed advanced by 256 samples; esi is a byte offset
 * that starts at -1024 and is raised by 32 per pass (four 8-byte MMX
 * registers).
 * NOTE(review): assumes sample_t is a 4-byte float, i.e. 1024 bytes is
 * 256 samples - confirm.
 * NOTE(review): the loop label and backward branch are not visible in this
 * listing; presumably the pass repeats until esi reaches 0 - confirm.
 */
1646 static void move2to1_3dnow (sample_t
* src
, sample_t
* dest
, sample_t bias
)
/* mm7 = { bias, bias } */
1649 "movd %2, %%mm7 \n\t"
1650 "punpckldq %2, %%mm7 \n\t"
1651 "movl $-1024, %%esi \n\t"
1654 "movq (%0, %%esi), %%mm0 \n\t"
1655 "movq 8(%0, %%esi), %%mm1 \n\t"
1656 "movq 16(%0, %%esi), %%mm2 \n\t"
1657 "movq 24(%0, %%esi), %%mm3 \n\t"
/* add the block that starts 1024 bytes after src */
1658 "pfadd 1024(%0, %%esi), %%mm0 \n\t"
1659 "pfadd 1032(%0, %%esi), %%mm1 \n\t"
1660 "pfadd 1040(%0, %%esi), %%mm2 \n\t"
1661 "pfadd 1048(%0, %%esi), %%mm3 \n\t"
1662 "pfadd %%mm7, %%mm0 \n\t"
1663 "pfadd %%mm7, %%mm1 \n\t"
1664 "pfadd %%mm7, %%mm2 \n\t"
1665 "pfadd %%mm7, %%mm3 \n\t"
/* the result goes to dest; src is left untouched by these stores */
1666 "movq %%mm0, (%1, %%esi) \n\t"
1667 "movq %%mm1, 8(%1, %%esi) \n\t"
1668 "movq %%mm2, 16(%1, %%esi) \n\t"
1669 "movq %%mm3, 24(%1, %%esi) \n\t"
1670 "addl $32, %%esi \n\t"
/* operands: %0 = src + 256, %1 = dest + 256, %2 = bias (in memory) */
1672 :: "r" (src
+256), "r" (dest
+256), "m" (bias
)
/*
 * downmix_3dnow: 3DNow! downmix dispatcher.
 *
 * Switches on CONVERT (acmod, output & A52_CHANNEL_MASK) and, per case,
 * runs the matching mix*_3dnow / move2to1_3dnow kernels and/or memcpy
 * calls on the 256-sample blocks inside samples.
 *
 * samples - sample buffer, addressed in blocks of 256 samples
 * acmod   - first CONVERT argument; each case label pairs it with an
 *           output mode
 * output  - requested output mode; masked with A52_CHANNEL_MASK before use
 * bias    - forwarded unchanged to every mixing kernel
 * clev, slev - not referenced by any line visible in this listing
 *
 * NOTE(review): this listing omits the labels targeted by the goto
 * statements, any conditions guarding those gotos, and the statements that
 * terminate each case. Consult the complete file before changing the
 * control flow.
 */
1677 static void downmix_3dnow (sample_t
* samples
, int acmod
, int output
, sample_t bias
,
1678 sample_t clev
, sample_t slev
)
1680 switch (CONVERT (acmod
, output
& A52_CHANNEL_MASK
)) {
1682 case CONVERT (A52_CHANNEL
, A52_CHANNEL2
):
/* copy block 1 (samples + 256) over block 0 */
1683 memcpy (samples
, samples
+ 256, 256 * sizeof (sample_t
));
1686 case CONVERT (A52_CHANNEL
, A52_MONO
):
1687 case CONVERT (A52_STEREO
, A52_MONO
):
/* block 0 += block 1 + bias */
1689 mix2to1_3dnow (samples
, samples
+ 256, bias
);
1692 case CONVERT (A52_2F1R
, A52_MONO
):
1694 goto mix_2to1_3dnow
;
1695 case CONVERT (A52_3F
, A52_MONO
):
1697 mix3to1_3dnow (samples
, bias
);
1700 case CONVERT (A52_3F1R
, A52_MONO
):
1702 goto mix_3to1_3dnow
;
1703 case CONVERT (A52_2F2R
, A52_MONO
):
1705 goto mix_2to1_3dnow
;
1706 mix4to1_3dnow (samples
, bias
);
1709 case CONVERT (A52_3F2R
, A52_MONO
):
1711 goto mix_3to1_3dnow
;
1712 mix5to1_3dnow (samples
, bias
);
1715 case CONVERT (A52_MONO
, A52_DOLBY
):
/* duplicate block 0 into block 1 */
1716 memcpy (samples
+ 256, samples
, 256 * sizeof (sample_t
));
1719 case CONVERT (A52_3F
, A52_STEREO
):
1720 case CONVERT (A52_3F
, A52_DOLBY
):
1722 mix3to2_3dnow (samples
, bias
);
1725 case CONVERT (A52_2F1R
, A52_STEREO
):
1728 mix21to2_3dnow (samples
, samples
+ 256, bias
);
1731 case CONVERT (A52_2F1R
, A52_DOLBY
):
1732 mix21toS_3dnow (samples
, bias
);
1735 case CONVERT (A52_3F1R
, A52_STEREO
):
1737 goto mix_3to2_3dnow
;
1738 mix31to2_3dnow (samples
, bias
);
1741 case CONVERT (A52_3F1R
, A52_DOLBY
):
1742 mix31toS_3dnow (samples
, bias
);
1745 case CONVERT (A52_2F2R
, A52_STEREO
):
/* block 0 += block 2 + bias, then block 1 += block 3 + bias */
1748 mix2to1_3dnow (samples
, samples
+ 512, bias
);
1749 mix2to1_3dnow (samples
+ 256, samples
+ 768, bias
);
1752 case CONVERT (A52_2F2R
, A52_DOLBY
):
1753 mix22toS_3dnow (samples
, bias
);
1756 case CONVERT (A52_3F2R
, A52_STEREO
):
1758 goto mix_3to2_3dnow
;
1759 mix32to2_3dnow (samples
, bias
);
1762 case CONVERT (A52_3F2R
, A52_DOLBY
):
1763 mix32toS_3dnow (samples
, bias
);
1766 case CONVERT (A52_3F1R
, A52_3F
):
1769 mix21to2_3dnow (samples
, samples
+ 512, bias
);
1772 case CONVERT (A52_3F2R
, A52_3F
):
/* block 0 += block 3 + bias, then block 2 += block 4 + bias */
1775 mix2to1_3dnow (samples
, samples
+ 768, bias
);
1776 mix2to1_3dnow (samples
+ 512, samples
+ 1024, bias
);
1779 case CONVERT (A52_3F1R
, A52_2F1R
):
1780 mix3to2_3dnow (samples
, bias
);
/* move block 3 down into block 2 */
1781 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1784 case CONVERT (A52_2F2R
, A52_2F1R
):
1785 mix2to1_3dnow (samples
+ 512, samples
+ 768, bias
);
1788 case CONVERT (A52_3F2R
, A52_2F1R
):
1789 mix3to2_3dnow (samples
, bias
); //FIXME possible bug? (output doesnt seem to be used)
/* block 2 = block 3 + block 4 + bias */
1790 move2to1_3dnow (samples
+ 768, samples
+ 512, bias
);
1793 case CONVERT (A52_3F2R
, A52_3F1R
):
1794 mix2to1_3dnow (samples
+ 768, samples
+ 1024, bias
);
1797 case CONVERT (A52_2F1R
, A52_2F2R
):
/* duplicate block 2 into block 3 */
1798 memcpy (samples
+ 768, samples
+ 512, 256 * sizeof (sample_t
));
1801 case CONVERT (A52_3F1R
, A52_2F2R
):
1802 mix3to2_3dnow (samples
, bias
);
1803 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1806 case CONVERT (A52_3F2R
, A52_2F2R
):
1807 mix3to2_3dnow (samples
, bias
);
/* shift blocks 3 and 4 down by one block each */
1808 memcpy (samples
+ 512, samples
+ 768, 256 * sizeof (sample_t
));
1809 memcpy (samples
+ 768, samples
+ 1024, 256 * sizeof (sample_t
));
1812 case CONVERT (A52_3F1R
, A52_3F2R
):
/* duplicate block 3 into block 4 */
1813 memcpy (samples
+ 1024, samples
+ 768, 256 * sizeof (sample_t
));
/* femms leaves the MMX/3DNow! register state; "memory" is listed as a clobber */
1816 __asm
__volatile("femms":::"memory");