2 ** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
3 ** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
5 ** This program is free software; you can redistribute it and/or modify
6 ** it under the terms of the GNU General Public License as published by
7 ** the Free Software Foundation; either version 2 of the License, or
8 ** (at your option) any later version.
10 ** This program is distributed in the hope that it will be useful,
11 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ** GNU General Public License for more details.
15 ** You should have received a copy of the GNU General Public License
16 ** along with this program; if not, write to the Free Software
17 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 ** Any non-GPL usage of this software or parts of this software is strictly
22 ** Commercial non-GPL licensing of this software is possible.
23 ** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
25 ** Initially modified for use with MPlayer by Arpad Gereöffy on 2003/08/30
26 ** $Id: sbr_qmf.c,v 1.3 2004/06/02 22:59:03 diego Exp $
27 ** detailed CVS changelog at http://www.mplayerhq.hu/cgi-bin/cvsweb.cgi/main/
40 #include "sbr_qmf_c.h"
41 #include "sbr_syntax.h"
44 qmfa_info
*qmfa_init(uint8_t channels
)
46 qmfa_info
*qmfa
= (qmfa_info
*)faad_malloc(sizeof(qmfa_info
));
47 qmfa
->x
= (real_t
*)faad_malloc(channels
* 10 * sizeof(real_t
));
48 memset(qmfa
->x
, 0, channels
* 10 * sizeof(real_t
));
50 qmfa
->channels
= channels
;
55 void qmfa_end(qmfa_info
*qmfa
)
59 if (qmfa
->x
) faad_free(qmfa
->x
);
64 void sbr_qmf_analysis_32(sbr_info
*sbr
, qmfa_info
*qmfa
, const real_t
*input
,
65 qmf_t X
[MAX_NTSRHFG
][32], uint8_t offset
, uint8_t kx
)
69 ALIGN real_t x
[64], y
[64];
77 for (l
= 0; l
< sbr
->numTimeSlotsRate
; l
++)
81 /* shift input buffer x */
82 memmove(qmfa
->x
+ 32, qmfa
->x
, (320-32)*sizeof(real_t
));
84 /* add new samples to input buffer x */
85 for (n
= 32 - 1; n
>= 0; n
--)
88 qmfa
->x
[n
] = (input
[in
++]) >> 5;
90 qmfa
->x
[n
] = input
[in
++];
94 /* window and summation to create array u */
95 for (n
= 0; n
< 64; n
++)
97 u
[n
] = MUL_F(qmfa
->x
[n
], qmf_c
[2*n
]) +
98 MUL_F(qmfa
->x
[n
+ 64], qmf_c
[2*(n
+ 64)]) +
99 MUL_F(qmfa
->x
[n
+ 128], qmf_c
[2*(n
+ 128)]) +
100 MUL_F(qmfa
->x
[n
+ 192], qmf_c
[2*(n
+ 192)]) +
101 MUL_F(qmfa
->x
[n
+ 256], qmf_c
[2*(n
+ 256)]);
104 /* calculate 32 subband samples by introducing X */
107 for (n
= 1; n
< 16; n
++)
108 y
[n
] = u
[n
+48] + u
[48-n
];
109 for (n
= 16; n
< 32; n
++)
110 y
[n
] = -u
[n
-16] + u
[48-n
];
112 DCT3_32_unscaled(u
, y
);
114 for (n
= 0; n
< 32; n
++)
119 QMF_RE(X
[l
+ offset
][n
]) = u
[n
] << 1;
121 QMF_RE(X
[l
+ offset
][n
]) = 2. * u
[n
];
124 QMF_RE(X
[l
+ offset
][n
]) = 0;
129 for (n
= 0; n
< 31; n
++)
131 x
[2*n
+1] = u
[n
+1] + u
[63-n
];
132 x
[2*n
+2] = u
[n
+1] - u
[63-n
];
136 DCT4_64_kernel(y
, x
);
138 for (n
= 0; n
< 32; n
++)
143 QMF_RE(X
[l
+ offset
][n
]) = y
[n
] << 1;
144 QMF_IM(X
[l
+ offset
][n
]) = -y
[63-n
] << 1;
146 QMF_RE(X
[l
+ offset
][n
]) = 2. * y
[n
];
147 QMF_IM(X
[l
+ offset
][n
]) = -2. * y
[63-n
];
150 QMF_RE(X
[l
+ offset
][n
]) = 0;
151 QMF_IM(X
[l
+ offset
][n
]) = 0;
158 qmfs_info
*qmfs_init(uint8_t channels
)
160 qmfs_info
*qmfs
= (qmfs_info
*)faad_malloc(sizeof(qmfs_info
));
162 #ifndef SBR_LOW_POWER
163 qmfs
->v
[0] = (real_t
*)faad_malloc(channels
* 10 * sizeof(real_t
));
164 memset(qmfs
->v
[0], 0, channels
* 10 * sizeof(real_t
));
165 qmfs
->v
[1] = (real_t
*)faad_malloc(channels
* 10 * sizeof(real_t
));
166 memset(qmfs
->v
[1], 0, channels
* 10 * sizeof(real_t
));
168 qmfs
->v
[0] = (real_t
*)faad_malloc(channels
* 20 * sizeof(real_t
));
169 memset(qmfs
->v
[0], 0, channels
* 20 * sizeof(real_t
));
175 qmfs
->channels
= channels
;
180 qmfs
->qmf_func
= sbr_qmf_synthesis_64_sse
;
182 qmfs
->qmf_func
= sbr_qmf_synthesis_64
;
189 void qmfs_end(qmfs_info
*qmfs
)
193 if (qmfs
->v
[0]) faad_free(qmfs
->v
[0]);
194 #ifndef SBR_LOW_POWER
195 if (qmfs
->v
[1]) faad_free(qmfs
->v
[1]);
/* 64-band QMF synthesis, SBR_LOW_POWER (real-valued) variant.
 *
 * NOTE(review): this block is a mangled extraction — logical lines are
 * split across physical lines, the original file's line-number gutter is
 * fused into the text, and several interior source lines (braces, local
 * declarations, a 32-iteration loop body before DCT2_64_unscaled, and the
 * preprocessor structure selecting between the two alternative windowing
 * loops) are missing.  The code is left byte-identical; only comments
 * were added.  Recover the authoritative text from upstream FAAD2
 * sbr_qmf.c before editing.
 *
 * Visible behaviour per time slot: shift the 1280-sample history buffer
 * qmfs->v[0] by 128, transform the 64 real subband samples with a DCT-II
 * and DST-II pair into 128 new history samples, then window 64 output
 * samples against the 640-tap prototype qmf_c.
 */
202 void sbr_qmf_synthesis_64(sbr_info
*sbr
, qmfs_info
*qmfs
, qmf_t X
[MAX_NTSRHFG
][64],
207 int16_t n
, k
, out
= 0;
211 /* qmf subsample l */
212 for (l
= 0; l
< sbr
->numTimeSlotsRate
; l
++)
217 //memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t));
218 //memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t));
/* shift the single 1280-sample history buffer by 128 new samples */
219 memmove(qmfs
->v
[0] + 128, qmfs
->v
[0], (1280-128)*sizeof(real_t
));
221 //v0 = qmfs->v[qmfs->v_index];
222 //v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
223 //qmfs->v_index = (qmfs->v_index + 1) & 0x1;
225 /* calculate 128 samples */
/* load subband samples; the /32. form is presumably the non-FIXED_POINT
 * branch of a dropped #ifdef — TODO confirm against upstream */
226 for (k
= 0; k
< 64; k
++)
229 x
[k
] = QMF_RE(X
[l
][k
]);
231 x
[k
] = QMF_RE(X
[l
][k
]) / 32.;
/* NOTE(review): the body of this loop (original lines 236-240) is
 * missing from the extraction */
235 for (n
= 0; n
< 32; n
++)
241 DCT2_64_unscaled(x
, x
);
/* scatter DCT-II output into the history buffer */
243 for (n
= 0; n
< 64; n
++)
245 qmfs
->v
[0][n
+32] = x
[n
];
247 for (n
= 0; n
< 32; n
++)
249 qmfs
->v
[0][31 - n
] = x
[n
+ 1];
251 DST2_64_unscaled(x
, y
);
253 for (n
= 1; n
< 32; n
++)
255 qmfs
->v
[0][n
+ 96] = x
[n
-1];
258 /* calculate 64 output samples and window */
259 for (k
= 0; k
< 64; k
++)
262 output
[out
++] = MUL_F(qmfs
->v
[0][k
], qmf_c
[k
]) +
263 MUL_F(qmfs
->v
[0][192 + k
], qmf_c
[64 + k
]) +
264 MUL_F(qmfs
->v
[0][256 + k
], qmf_c
[128 + k
]) +
265 MUL_F(qmfs
->v
[0][256 + 192 + k
], qmf_c
[128 + 64 + k
]) +
266 MUL_F(qmfs
->v
[0][512 + k
], qmf_c
[256 + k
]) +
267 MUL_F(qmfs
->v
[0][512 + 192 + k
], qmf_c
[256 + 64 + k
]) +
268 MUL_F(qmfs
->v
[0][768 + k
], qmf_c
[384 + k
]) +
269 MUL_F(qmfs
->v
[0][768 + 192 + k
], qmf_c
[384 + 64 + k
]) +
270 MUL_F(qmfs
->v
[0][1024 + k
], qmf_c
[512 + k
]) +
271 MUL_F(qmfs
->v
[0][1024 + 192 + k
], qmf_c
[512 + 64 + k
]);
/* NOTE(review): alternative windowing over v0 — v0 is only declared in
 * the commented-out lines above, so this was presumably disabled by a
 * dropped preprocessor conditional; confirm against upstream */
273 output
[out
++] = MUL_F(v0
[k
], qmf_c
[k
]) +
274 MUL_F(v0
[64 + k
], qmf_c
[64 + k
]) +
275 MUL_F(v0
[128 + k
], qmf_c
[128 + k
]) +
276 MUL_F(v0
[192 + k
], qmf_c
[192 + k
]) +
277 MUL_F(v0
[256 + k
], qmf_c
[256 + k
]) +
278 MUL_F(v0
[320 + k
], qmf_c
[320 + k
]) +
279 MUL_F(v0
[384 + k
], qmf_c
[384 + k
]) +
280 MUL_F(v0
[448 + k
], qmf_c
[448 + k
]) +
281 MUL_F(v0
[512 + k
], qmf_c
[512 + k
]) +
282 MUL_F(v0
[576 + k
], qmf_c
[576 + k
]);
/* 64-band QMF synthesis, SBR_LOW_POWER variant, "SSE" entry point.
 *
 * NOTE(review): mangled extraction — see the twin sbr_qmf_synthesis_64
 * above; this body is (as visible) identical to it and contains no SSE
 * intrinsics, so the name presumably only serves the runtime dispatch
 * table.  Interior lines (braces, declarations, one loop body, the
 * preprocessor structure around the two windowing loops) are missing.
 * Code left byte-identical; only comments added.
 */
288 void sbr_qmf_synthesis_64_sse(sbr_info
*sbr
, qmfs_info
*qmfs
, qmf_t X
[MAX_NTSRHFG
][64],
294 int16_t n
, k
, out
= 0;
297 /* qmf subsample l */
298 for (l
= 0; l
< sbr
->numTimeSlotsRate
; l
++)
303 //memmove(qmfs->v[0] + 64, qmfs->v[0], (640-64)*sizeof(real_t));
304 //memmove(qmfs->v[1] + 64, qmfs->v[1], (640-64)*sizeof(real_t));
/* shift the single 1280-sample history buffer by 128 new samples */
305 memmove(qmfs
->v
[0] + 128, qmfs
->v
[0], (1280-128)*sizeof(real_t
));
307 //v0 = qmfs->v[qmfs->v_index];
308 //v1 = qmfs->v[(qmfs->v_index + 1) & 0x1];
309 //qmfs->v_index = (qmfs->v_index + 1) & 0x1;
311 /* calculate 128 samples */
/* load subband samples; /32. is presumably the non-FIXED_POINT branch of
 * a dropped #ifdef — TODO confirm against upstream */
312 for (k
= 0; k
< 64; k
++)
315 x
[k
] = QMF_RE(X
[l
][k
]);
317 x
[k
] = QMF_RE(X
[l
][k
]) / 32.;
/* NOTE(review): the body of this loop (original lines 322-326) is
 * missing from the extraction */
321 for (n
= 0; n
< 32; n
++)
327 DCT2_64_unscaled(x
, x
);
/* scatter DCT-II output into the history buffer */
329 for (n
= 0; n
< 64; n
++)
331 qmfs
->v
[0][n
+32] = x
[n
];
333 for (n
= 0; n
< 32; n
++)
335 qmfs
->v
[0][31 - n
] = x
[n
+ 1];
338 DST2_64_unscaled(x
, y
);
340 for (n
= 1; n
< 32; n
++)
342 qmfs
->v
[0][n
+ 96] = x
[n
-1];
345 /* calculate 64 output samples and window */
346 for (k
= 0; k
< 64; k
++)
349 output
[out
++] = MUL_F(qmfs
->v
[0][k
], qmf_c
[k
]) +
350 MUL_F(qmfs
->v
[0][192 + k
], qmf_c
[64 + k
]) +
351 MUL_F(qmfs
->v
[0][256 + k
], qmf_c
[128 + k
]) +
352 MUL_F(qmfs
->v
[0][256 + 192 + k
], qmf_c
[128 + 64 + k
]) +
353 MUL_F(qmfs
->v
[0][512 + k
], qmf_c
[256 + k
]) +
354 MUL_F(qmfs
->v
[0][512 + 192 + k
], qmf_c
[256 + 64 + k
]) +
355 MUL_F(qmfs
->v
[0][768 + k
], qmf_c
[384 + k
]) +
356 MUL_F(qmfs
->v
[0][768 + 192 + k
], qmf_c
[384 + 64 + k
]) +
357 MUL_F(qmfs
->v
[0][1024 + k
], qmf_c
[512 + k
]) +
358 MUL_F(qmfs
->v
[0][1024 + 192 + k
], qmf_c
[512 + 64 + k
]);
/* NOTE(review): alternative windowing over v0 — v0 is only declared in
 * the commented-out lines above, so this was presumably disabled by a
 * dropped preprocessor conditional; confirm against upstream */
360 output
[out
++] = MUL_F(v0
[k
], qmf_c
[k
]) +
361 MUL_F(v0
[64 + k
], qmf_c
[64 + k
]) +
362 MUL_F(v0
[128 + k
], qmf_c
[128 + k
]) +
363 MUL_F(v0
[192 + k
], qmf_c
[192 + k
]) +
364 MUL_F(v0
[256 + k
], qmf_c
[256 + k
]) +
365 MUL_F(v0
[320 + k
], qmf_c
[320 + k
]) +
366 MUL_F(v0
[384 + k
], qmf_c
[384 + k
]) +
367 MUL_F(v0
[448 + k
], qmf_c
[448 + k
]) +
368 MUL_F(v0
[512 + k
], qmf_c
[512 + k
]) +
369 MUL_F(v0
[576 + k
], qmf_c
[576 + k
]);
375 void sbr_qmf_synthesis_64(sbr_info
*sbr
, qmfs_info
*qmfs
, qmf_t X
[MAX_NTSRHFG
][64],
378 ALIGN real_t x1
[64], x2
[64];
379 real_t scale
= 1.f
/64.f
;
380 int16_t n
, k
, out
= 0;
384 /* qmf subsample l */
385 for (l
= 0; l
< sbr
->numTimeSlotsRate
; l
++)
390 memmove(qmfs
->v
[0] + 64, qmfs
->v
[0], (640-64)*sizeof(real_t
));
391 memmove(qmfs
->v
[1] + 64, qmfs
->v
[1], (640-64)*sizeof(real_t
));
393 v0
= qmfs
->v
[qmfs
->v_index
];
394 v1
= qmfs
->v
[(qmfs
->v_index
+ 1) & 0x1];
395 qmfs
->v_index
= (qmfs
->v_index
+ 1) & 0x1;
397 /* calculate 128 samples */
398 x1
[0] = scale
*QMF_RE(X
[l
][0]);
399 x2
[63] = scale
*QMF_IM(X
[l
][0]);
400 for (k
= 0; k
< 31; k
++)
402 x1
[2*k
+1] = scale
*(QMF_RE(X
[l
][2*k
+1]) - QMF_RE(X
[l
][2*k
+2]));
403 x1
[2*k
+2] = scale
*(QMF_RE(X
[l
][2*k
+1]) + QMF_RE(X
[l
][2*k
+2]));
405 x2
[61 - 2*k
] = scale
*(QMF_IM(X
[l
][2*k
+2]) - QMF_IM(X
[l
][2*k
+1]));
406 x2
[62 - 2*k
] = scale
*(QMF_IM(X
[l
][2*k
+2]) + QMF_IM(X
[l
][2*k
+1]));
408 x1
[63] = scale
*QMF_RE(X
[l
][63]);
409 x2
[0] = scale
*QMF_IM(X
[l
][63]);
411 DCT4_64_kernel(x1
, x1
);
412 DCT4_64_kernel(x2
, x2
);
414 for (n
= 0; n
< 32; n
++)
416 v0
[ 2*n
] = x2
[2*n
] - x1
[2*n
];
417 v1
[63-2*n
] = x2
[2*n
] + x1
[2*n
];
418 v0
[ 2*n
+1] = -x2
[2*n
+1] - x1
[2*n
+1];
419 v1
[62-2*n
] = -x2
[2*n
+1] + x1
[2*n
+1];
422 /* calculate 64 output samples and window */
423 for (k
= 0; k
< 64; k
++)
425 output
[out
++] = MUL_F(v0
[k
], qmf_c
[k
]) +
426 MUL_F(v0
[64 + k
], qmf_c
[64 + k
]) +
427 MUL_F(v0
[128 + k
], qmf_c
[128 + k
]) +
428 MUL_F(v0
[192 + k
], qmf_c
[192 + k
]) +
429 MUL_F(v0
[256 + k
], qmf_c
[256 + k
]) +
430 MUL_F(v0
[320 + k
], qmf_c
[320 + k
]) +
431 MUL_F(v0
[384 + k
], qmf_c
[384 + k
]) +
432 MUL_F(v0
[448 + k
], qmf_c
[448 + k
]) +
433 MUL_F(v0
[512 + k
], qmf_c
[512 + k
]) +
434 MUL_F(v0
[576 + k
], qmf_c
[576 + k
]);
440 void memmove_sse_576(real_t
*out
, const real_t
*in
)
445 for (i
= 0; i
< 144; i
++)
447 m
[i
] = _mm_load_ps(&in
[i
*4]);
449 for (i
= 0; i
< 144; i
++)
451 _mm_store_ps(&out
[i
*4], m
[i
]);
455 void sbr_qmf_synthesis_64_sse(sbr_info
*sbr
, qmfs_info
*qmfs
, qmf_t X
[MAX_NTSRHFG
][64],
458 ALIGN real_t x1
[64], x2
[64];
459 real_t scale
= 1.f
/64.f
;
460 int16_t n
, k
, out
= 0;
464 /* qmf subsample l */
465 for (l
= 0; l
< sbr
->numTimeSlotsRate
; l
++)
470 memmove_sse_576(qmfs
->v
[0] + 64, qmfs
->v
[0]);
471 memmove_sse_576(qmfs
->v
[1] + 64, qmfs
->v
[1]);
473 v0
= qmfs
->v
[qmfs
->v_index
];
474 v1
= qmfs
->v
[(qmfs
->v_index
+ 1) & 0x1];
475 qmfs
->v_index
= (qmfs
->v_index
+ 1) & 0x1;
477 /* calculate 128 samples */
478 x1
[0] = scale
*QMF_RE(X
[l
][0]);
479 x2
[63] = scale
*QMF_IM(X
[l
][0]);
480 for (k
= 0; k
< 31; k
++)
482 x1
[2*k
+1] = scale
*(QMF_RE(X
[l
][2*k
+1]) - QMF_RE(X
[l
][2*k
+2]));
483 x1
[2*k
+2] = scale
*(QMF_RE(X
[l
][2*k
+1]) + QMF_RE(X
[l
][2*k
+2]));
485 x2
[61 - 2*k
] = scale
*(QMF_IM(X
[l
][2*k
+2]) - QMF_IM(X
[l
][2*k
+1]));
486 x2
[62 - 2*k
] = scale
*(QMF_IM(X
[l
][2*k
+2]) + QMF_IM(X
[l
][2*k
+1]));
488 x1
[63] = scale
*QMF_RE(X
[l
][63]);
489 x2
[0] = scale
*QMF_IM(X
[l
][63]);
491 DCT4_64_kernel(x1
, x1
);
492 DCT4_64_kernel(x2
, x2
);
494 for (n
= 0; n
< 32; n
++)
496 v0
[ 2*n
] = x2
[2*n
] - x1
[2*n
];
497 v1
[63- 2*n
] = x2
[2*n
] + x1
[2*n
];
498 v0
[ 2*n
+1 ] = -x2
[2*n
+1] - x1
[2*n
+1];
499 v1
[63-(2*n
+1)] = -x2
[2*n
+1] + x1
[2*n
+1];
502 /* calculate 64 output samples and window */
503 for (k
= 0; k
< 64; k
+=4)
505 __m128 m0
, m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
, m9
;
506 __m128 c0
, c1
, c2
, c3
, c4
, c5
, c6
, c7
, c8
, c9
;
507 __m128 s1
, s2
, s3
, s4
, s5
, s6
, s7
, s8
, s9
;
509 m0
= _mm_load_ps(&v0
[k
]);
510 m1
= _mm_load_ps(&v0
[k
+ 64]);
511 m2
= _mm_load_ps(&v0
[k
+ 128]);
512 m3
= _mm_load_ps(&v0
[k
+ 192]);
513 m4
= _mm_load_ps(&v0
[k
+ 256]);
514 c0
= _mm_load_ps(&qmf_c
[k
]);
515 c1
= _mm_load_ps(&qmf_c
[k
+ 64]);
516 c2
= _mm_load_ps(&qmf_c
[k
+ 128]);
517 c3
= _mm_load_ps(&qmf_c
[k
+ 192]);
518 c4
= _mm_load_ps(&qmf_c
[k
+ 256]);
520 m0
= _mm_mul_ps(m0
, c0
);
521 m1
= _mm_mul_ps(m1
, c1
);
522 m2
= _mm_mul_ps(m2
, c2
);
523 m3
= _mm_mul_ps(m3
, c3
);
524 m4
= _mm_mul_ps(m4
, c4
);
526 s1
= _mm_add_ps(m0
, m1
);
527 s2
= _mm_add_ps(m2
, m3
);
528 s6
= _mm_add_ps(s1
, s2
);
530 m5
= _mm_load_ps(&v0
[k
+ 320]);
531 m6
= _mm_load_ps(&v0
[k
+ 384]);
532 m7
= _mm_load_ps(&v0
[k
+ 448]);
533 m8
= _mm_load_ps(&v0
[k
+ 512]);
534 m9
= _mm_load_ps(&v0
[k
+ 576]);
535 c5
= _mm_load_ps(&qmf_c
[k
+ 320]);
536 c6
= _mm_load_ps(&qmf_c
[k
+ 384]);
537 c7
= _mm_load_ps(&qmf_c
[k
+ 448]);
538 c8
= _mm_load_ps(&qmf_c
[k
+ 512]);
539 c9
= _mm_load_ps(&qmf_c
[k
+ 576]);
541 m5
= _mm_mul_ps(m5
, c5
);
542 m6
= _mm_mul_ps(m6
, c6
);
543 m7
= _mm_mul_ps(m7
, c7
);
544 m8
= _mm_mul_ps(m8
, c8
);
545 m9
= _mm_mul_ps(m9
, c9
);
547 s3
= _mm_add_ps(m4
, m5
);
548 s4
= _mm_add_ps(m6
, m7
);
549 s5
= _mm_add_ps(m8
, m9
);
550 s7
= _mm_add_ps(s3
, s4
);
551 s8
= _mm_add_ps(s5
, s6
);
552 s9
= _mm_add_ps(s7
, s8
);
554 _mm_store_ps(&output
[out
], s9
);