avi subtitle stream dumper
[mplayer/greg.git] / libfaad2 / filtbank.c
blob15798847c2693b5b8bc39cc2532fdfb6dbb323ba
/*
** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**
** Any non-GPL usage of this software or parts of this software is strictly
** forbidden.
**
** Commercial non-GPL licensing of this software is possible.
** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
**
** $Id: filtbank.c,v 1.38 2004/06/30 12:45:56 menno Exp $
**/
28 #include "common.h"
29 #include "structs.h"
31 #include <stdlib.h>
32 #include <string.h>
33 #ifdef _WIN32_WCE
34 #define assert(x)
35 #else
36 #include <assert.h>
37 #endif
39 #include "filtbank.h"
40 #include "decoder.h"
41 #include "syntax.h"
42 #include "kbd_win.h"
43 #include "sine_win.h"
44 #include "mdct.h"
47 fb_info *filter_bank_init(uint16_t frame_len)
49 uint16_t nshort = frame_len/8;
50 #ifdef LD_DEC
51 uint16_t frame_len_ld = frame_len/2;
52 #endif
54 fb_info *fb = (fb_info*)faad_malloc(sizeof(fb_info));
55 memset(fb, 0, sizeof(fb_info));
57 /* normal */
58 fb->mdct256 = faad_mdct_init(2*nshort);
59 fb->mdct2048 = faad_mdct_init(2*frame_len);
60 #ifdef LD_DEC
61 /* LD */
62 fb->mdct1024 = faad_mdct_init(2*frame_len_ld);
63 #endif
65 #ifdef ALLOW_SMALL_FRAMELENGTH
66 if (frame_len == 1024)
68 #endif
69 fb->long_window[0] = sine_long_1024;
70 fb->short_window[0] = sine_short_128;
71 fb->long_window[1] = kbd_long_1024;
72 fb->short_window[1] = kbd_short_128;
73 #ifdef LD_DEC
74 fb->ld_window[0] = sine_mid_512;
75 fb->ld_window[1] = ld_mid_512;
76 #endif
77 #ifdef ALLOW_SMALL_FRAMELENGTH
78 } else /* (frame_len == 960) */ {
79 fb->long_window[0] = sine_long_960;
80 fb->short_window[0] = sine_short_120;
81 fb->long_window[1] = kbd_long_960;
82 fb->short_window[1] = kbd_short_120;
83 #ifdef LD_DEC
84 fb->ld_window[0] = sine_mid_480;
85 fb->ld_window[1] = ld_mid_480;
86 #endif
88 #endif
90 #ifdef USE_SSE
91 if (cpu_has_sse())
93 fb->if_func = ifilter_bank_sse;
94 } else {
95 fb->if_func = ifilter_bank;
97 #endif
99 return fb;
102 void filter_bank_end(fb_info *fb)
104 if (fb != NULL)
106 #ifdef PROFILE
107 printf("FB: %I64d cycles\n", fb->cycles);
108 #endif
110 faad_mdct_end(fb->mdct256);
111 faad_mdct_end(fb->mdct2048);
112 #ifdef LD_DEC
113 faad_mdct_end(fb->mdct1024);
114 #endif
116 faad_free(fb);
120 static INLINE void imdct_long(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
122 #ifdef LD_DEC
123 mdct_info *mdct = NULL;
125 switch (len)
127 case 2048:
128 case 1920:
129 mdct = fb->mdct2048;
130 break;
131 case 1024:
132 case 960:
133 mdct = fb->mdct1024;
134 break;
137 faad_imdct(mdct, in_data, out_data);
138 #else
139 faad_imdct(fb->mdct2048, in_data, out_data);
140 #endif
#ifdef USE_SSE
/*
 * SSE counterpart of imdct_long(): runs faad_imdct_sse() for a long block.
 *
 * len: transform length; callers in this file pass twice the frame length.
 *      With LD_DEC it selects between the mdct2048 and mdct1024 contexts;
 *      without LD_DEC it is ignored and mdct2048 is always used.
 *
 * With LD_DEC an unsupported len leaves out_data untouched.
 */
static INLINE void imdct_long_sse(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
{
#ifdef LD_DEC
    mdct_info *ctx = NULL;

    switch (len)
    {
    case 2048:
    case 1920:
        ctx = fb->mdct2048;
        break;
    case 1024:
    case 960:
        ctx = fb->mdct1024;
        break;
    default:
        /* no context for this length: do not pass NULL to faad_imdct_sse() */
        return;
    }

    faad_imdct_sse(ctx, in_data, out_data);
#else
    (void)len;
    faad_imdct_sse(fb->mdct2048, in_data, out_data);
#endif
}
#endif
#ifdef LTP_DEC
/*
 * Runs the forward MDCT, picking the context that matches len:
 * 2048/1920 -> mdct2048, 256/240 -> mdct256 and, with LD_DEC,
 * 1024/960 -> mdct1024.  Any other len leaves out_data untouched.
 */
static INLINE void mdct(fb_info *fb, real_t *in_data, real_t *out_data, uint16_t len)
{
    /* named ctx so that it does not shadow this function's own name */
    mdct_info *ctx = NULL;

    switch (len)
    {
    case 2048:
    case 1920:
        ctx = fb->mdct2048;
        break;
    case 256:
    case 240:
        ctx = fb->mdct256;
        break;
#ifdef LD_DEC
    case 1024:
    case 960:
        ctx = fb->mdct1024;
        break;
#endif
    default:
        /* no context for this length: do not pass NULL to faad_mdct() */
        return;
    }

    faad_mdct(ctx, in_data, out_data);
}
#endif
195 void ifilter_bank(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
196 uint8_t window_shape_prev, real_t *freq_in,
197 real_t *time_out, real_t *overlap,
198 uint8_t object_type, uint16_t frame_len)
200 int16_t i;
201 ALIGN real_t transf_buf[2*1024] = {0};
203 const real_t *window_long = NULL;
204 const real_t *window_long_prev = NULL;
205 const real_t *window_short = NULL;
206 const real_t *window_short_prev = NULL;
208 uint16_t nlong = frame_len;
209 uint16_t nshort = frame_len/8;
210 uint16_t trans = nshort/2;
212 uint16_t nflat_ls = (nlong-nshort)/2;
214 #ifdef PROFILE
215 int64_t count = faad_get_ts();
216 #endif
218 /* select windows of current frame and previous frame (Sine or KBD) */
219 #ifdef LD_DEC
220 if (object_type == LD)
222 window_long = fb->ld_window[window_shape];
223 window_long_prev = fb->ld_window[window_shape_prev];
224 } else {
225 #endif
226 window_long = fb->long_window[window_shape];
227 window_long_prev = fb->long_window[window_shape_prev];
228 window_short = fb->short_window[window_shape];
229 window_short_prev = fb->short_window[window_shape_prev];
230 #ifdef LD_DEC
232 #endif
234 #if 0
235 for (i = 0; i < 1024; i++)
237 printf("%d\n", freq_in[i]);
239 #endif
241 #if 0
242 printf("%d %d\n", window_sequence, window_shape);
243 #endif
245 switch (window_sequence)
247 case ONLY_LONG_SEQUENCE:
248 /* perform iMDCT */
249 imdct_long(fb, freq_in, transf_buf, 2*nlong);
251 /* add second half output of previous frame to windowed output of current frame */
252 for (i = 0; i < nlong; i+=4)
254 time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]);
255 time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
256 time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
257 time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
260 /* window the second half and save as overlap for next frame */
261 for (i = 0; i < nlong; i+=4)
263 overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
264 overlap[i+1] = MUL_F(transf_buf[nlong+i+1],window_long[nlong-2-i]);
265 overlap[i+2] = MUL_F(transf_buf[nlong+i+2],window_long[nlong-3-i]);
266 overlap[i+3] = MUL_F(transf_buf[nlong+i+3],window_long[nlong-4-i]);
268 break;
270 case LONG_START_SEQUENCE:
271 /* perform iMDCT */
272 imdct_long(fb, freq_in, transf_buf, 2*nlong);
274 /* add second half output of previous frame to windowed output of current frame */
275 for (i = 0; i < nlong; i+=4)
277 time_out[i] = overlap[i] + MUL_F(transf_buf[i],window_long_prev[i]);
278 time_out[i+1] = overlap[i+1] + MUL_F(transf_buf[i+1],window_long_prev[i+1]);
279 time_out[i+2] = overlap[i+2] + MUL_F(transf_buf[i+2],window_long_prev[i+2]);
280 time_out[i+3] = overlap[i+3] + MUL_F(transf_buf[i+3],window_long_prev[i+3]);
283 /* window the second half and save as overlap for next frame */
284 /* construct second half window using padding with 1's and 0's */
285 for (i = 0; i < nflat_ls; i++)
286 overlap[i] = transf_buf[nlong+i];
287 for (i = 0; i < nshort; i++)
288 overlap[nflat_ls+i] = MUL_F(transf_buf[nlong+nflat_ls+i],window_short[nshort-i-1]);
289 for (i = 0; i < nflat_ls; i++)
290 overlap[nflat_ls+nshort+i] = 0;
291 break;
293 case EIGHT_SHORT_SEQUENCE:
294 /* perform iMDCT for each short block */
295 faad_imdct(fb->mdct256, freq_in+0*nshort, transf_buf+2*nshort*0);
296 faad_imdct(fb->mdct256, freq_in+1*nshort, transf_buf+2*nshort*1);
297 faad_imdct(fb->mdct256, freq_in+2*nshort, transf_buf+2*nshort*2);
298 faad_imdct(fb->mdct256, freq_in+3*nshort, transf_buf+2*nshort*3);
299 faad_imdct(fb->mdct256, freq_in+4*nshort, transf_buf+2*nshort*4);
300 faad_imdct(fb->mdct256, freq_in+5*nshort, transf_buf+2*nshort*5);
301 faad_imdct(fb->mdct256, freq_in+6*nshort, transf_buf+2*nshort*6);
302 faad_imdct(fb->mdct256, freq_in+7*nshort, transf_buf+2*nshort*7);
304 /* add second half output of previous frame to windowed output of current frame */
305 for (i = 0; i < nflat_ls; i++)
306 time_out[i] = overlap[i];
307 for(i = 0; i < nshort; i++)
309 time_out[nflat_ls+ i] = overlap[nflat_ls+ i] + MUL_F(transf_buf[nshort*0+i],window_short_prev[i]);
310 time_out[nflat_ls+1*nshort+i] = overlap[nflat_ls+nshort*1+i] + MUL_F(transf_buf[nshort*1+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*2+i],window_short[i]);
311 time_out[nflat_ls+2*nshort+i] = overlap[nflat_ls+nshort*2+i] + MUL_F(transf_buf[nshort*3+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*4+i],window_short[i]);
312 time_out[nflat_ls+3*nshort+i] = overlap[nflat_ls+nshort*3+i] + MUL_F(transf_buf[nshort*5+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*6+i],window_short[i]);
313 if (i < trans)
314 time_out[nflat_ls+4*nshort+i] = overlap[nflat_ls+nshort*4+i] + MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]);
317 /* window the second half and save as overlap for next frame */
318 for(i = 0; i < nshort; i++)
320 if (i >= trans)
321 overlap[nflat_ls+4*nshort+i-nlong] = MUL_F(transf_buf[nshort*7+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*8+i],window_short[i]);
322 overlap[nflat_ls+5*nshort+i-nlong] = MUL_F(transf_buf[nshort*9+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*10+i],window_short[i]);
323 overlap[nflat_ls+6*nshort+i-nlong] = MUL_F(transf_buf[nshort*11+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*12+i],window_short[i]);
324 overlap[nflat_ls+7*nshort+i-nlong] = MUL_F(transf_buf[nshort*13+i],window_short[nshort-1-i]) + MUL_F(transf_buf[nshort*14+i],window_short[i]);
325 overlap[nflat_ls+8*nshort+i-nlong] = MUL_F(transf_buf[nshort*15+i],window_short[nshort-1-i]);
327 for (i = 0; i < nflat_ls; i++)
328 overlap[nflat_ls+nshort+i] = 0;
329 break;
331 case LONG_STOP_SEQUENCE:
332 /* perform iMDCT */
333 imdct_long(fb, freq_in, transf_buf, 2*nlong);
335 /* add second half output of previous frame to windowed output of current frame */
336 /* construct first half window using padding with 1's and 0's */
337 for (i = 0; i < nflat_ls; i++)
338 time_out[i] = overlap[i];
339 for (i = 0; i < nshort; i++)
340 time_out[nflat_ls+i] = overlap[nflat_ls+i] + MUL_F(transf_buf[nflat_ls+i],window_short_prev[i]);
341 for (i = 0; i < nflat_ls; i++)
342 time_out[nflat_ls+nshort+i] = overlap[nflat_ls+nshort+i] + transf_buf[nflat_ls+nshort+i];
344 /* window the second half and save as overlap for next frame */
345 for (i = 0; i < nlong; i++)
346 overlap[i] = MUL_F(transf_buf[nlong+i],window_long[nlong-1-i]);
347 break;
350 #if 0
351 for (i = 0; i < 1024; i++)
353 //printf("%d\n", time_out[i]);
354 printf("0x%.8X\n", time_out[i]);
356 #endif
359 #ifdef PROFILE
360 count = faad_get_ts() - count;
361 fb->cycles += count;
362 #endif
#ifdef USE_SSE
/* Loads p[0..3] with an aligned load and returns the four lanes in reverse order. */
static INLINE __m128 fb_sse_load_reversed(const real_t *p)
{
    __m128 v = _mm_load_ps(p);
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3));
}

/*
 * Seam between two short blocks, no carried overlap:
 *   dst[i] = tail[i]*win[nshort-1-i] + head[i]*win[i]   for from <= i < to
 * processed four samples per step.
 */
static INLINE void fb_sse_short_seam(real_t *dst, const real_t *tail,
                                     const real_t *head, const real_t *win,
                                     uint16_t nshort, uint16_t from, uint16_t to)
{
    int16_t i;

    for (i = from; i < to; i+=4)
    {
        __m128 falling = _mm_mul_ps(_mm_load_ps(&tail[i]),
                                    fb_sse_load_reversed(&win[nshort-4-i]));
        __m128 rising  = _mm_mul_ps(_mm_load_ps(&head[i]), _mm_load_ps(&win[i]));

        _mm_store_ps(&dst[i], _mm_add_ps(falling, rising));
    }
}

/*
 * Seam between two short blocks plus the carried overlap:
 *   dst[i] = (tail[i]*win[nshort-1-i] + ovl[i]) + head[i]*win[i]   for 0 <= i < n
 * processed four samples per step.  All loads of a step happen before its store.
 */
static INLINE void fb_sse_short_seam_ola(real_t *dst, const real_t *ovl,
                                         const real_t *tail, const real_t *head,
                                         const real_t *win, uint16_t nshort,
                                         uint16_t n)
{
    int16_t i;

    for (i = 0; i < n; i+=4)
    {
        __m128 falling = _mm_mul_ps(_mm_load_ps(&tail[i]),
                                    fb_sse_load_reversed(&win[nshort-4-i]));
        __m128 carried = _mm_load_ps(&ovl[i]);
        __m128 rising  = _mm_mul_ps(_mm_load_ps(&head[i]), _mm_load_ps(&win[i]));

        falling = _mm_add_ps(falling, carried);
        _mm_store_ps(&dst[i], _mm_add_ps(falling, rising));
    }
}

/*
 * SSE inverse filter bank.  Unlike ifilter_bank() there is no separate
 * overlap buffer: time_out holds 2*frame_len values, the first frame_len
 * being the output samples and the second frame_len the data carried over
 * to the next call.
 *
 * fb                 filter bank from filter_bank_init()
 * window_sequence    ONLY_LONG / LONG_START / EIGHT_SHORT / LONG_STOP
 * window_shape       index into the window tables for the current frame
 * window_shape_prev  index into the window tables for the previous frame
 * freq_in            spectral input
 * time_out           output samples followed by the carried overlap
 * object_type        with LD_DEC, LD selects the ld_window tables
 * frame_len          samples per frame; must not exceed 1024 because the
 *                    scratch buffer holds 2*1024 values
 *
 * Every buffer is accessed with aligned SSE loads/stores, four floats at a
 * time.
 */
void ifilter_bank_sse(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                      uint8_t window_shape_prev, real_t *freq_in,
                      real_t *time_out, uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    uint8_t blk;
    ALIGN real_t transf_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    uint16_t nlong = frame_len;
    uint16_t nshort = frame_len/8;
    uint16_t trans = nshort/2;

    uint16_t nflat_ls = (nlong-nshort)/2;

    /* carried overlap lives in the second half of time_out */
    real_t *overlap = time_out + frame_len;

#ifdef PROFILE
    int64_t count = faad_get_ts();
#endif

    /* the iMDCT output (2*nlong values) has to fit into transf_buf */
    if (2u*nlong > sizeof(transf_buf)/sizeof(transf_buf[0]))
        return;

#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long       = fb->ld_window[window_shape];
        window_long_prev  = fb->ld_window[window_shape_prev];
    } else {
#endif
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
#ifdef LD_DEC
    }
#endif

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        /* output = first half * previous window + carried overlap;
           new overlap = second half * current window read back to front */
        for (i = 0; i < nlong; i+=4)
        {
            __m128 first   = _mm_load_ps(&transf_buf[i]);
            __m128 w_prev  = _mm_load_ps(&window_long_prev[i]);
            __m128 w_rev   = fb_sse_load_reversed(&window_long[nlong-4-i]);
            __m128 carried = _mm_load_ps(&overlap[i]);
            __m128 second  = _mm_load_ps(&transf_buf[nlong+i]);

            _mm_store_ps(&time_out[i], _mm_add_ps(_mm_mul_ps(first, w_prev), carried));
            _mm_store_ps(&overlap[i], _mm_mul_ps(second, w_rev));
        }
        break;

    case LONG_START_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        for (i = 0; i < nlong; i+=4)
        {
            __m128 first   = _mm_load_ps(&transf_buf[i]);
            __m128 w_prev  = _mm_load_ps(&window_long_prev[i]);
            __m128 carried = _mm_load_ps(&overlap[i]);

            _mm_store_ps(&time_out[i], _mm_add_ps(_mm_mul_ps(first, w_prev), carried));
        }

        /* second-half window: flat ones, falling short window, zeros */
        for (i = 0; i < nflat_ls; i+=4)
        {
            _mm_store_ps(&overlap[i], _mm_load_ps(&transf_buf[nlong+i]));
        }
        for (i = 0; i < nshort; i+=4)
        {
            __m128 second = _mm_load_ps(&transf_buf[nlong+nflat_ls+i]);
            __m128 w_rev  = fb_sse_load_reversed(&window_short[nshort-4-i]);

            _mm_store_ps(&overlap[nflat_ls+i], _mm_mul_ps(second, w_rev));
        }
        for (i = 0; i < nflat_ls; i+=4)
        {
            _mm_store_ps(&overlap[nflat_ls+nshort+i], _mm_setzero_ps());
        }
        break;

    case EIGHT_SHORT_SEQUENCE:
        /* short block blk occupies transf_buf[2*nshort*blk .. 2*nshort*(blk+1)) */
        for (blk = 0; blk < 8; blk++)
            faad_imdct_sse(fb->mdct256, &freq_in[blk*nshort], &transf_buf[2*nshort*blk]);

        /* flat leading part: carried overlap only */
        for (i = 0; i < nflat_ls; i+=4)
        {
            _mm_store_ps(&time_out[i], _mm_load_ps(&overlap[i]));
        }

        /* rising half of the first short block, previous frame's window shape */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 first   = _mm_load_ps(&transf_buf[nshort*0+i]);
            __m128 w_prev  = _mm_load_ps(&window_short_prev[i]);
            __m128 carried = _mm_load_ps(&overlap[nflat_ls+i]);

            _mm_store_ps(&time_out[nflat_ls+i], _mm_add_ps(_mm_mul_ps(first, w_prev), carried));
        }

        /* seams 1..3 lie completely inside the output half */
        for (blk = 1; blk <= 3; blk++)
        {
            fb_sse_short_seam_ola(&time_out[nflat_ls+blk*nshort],
                                  &overlap[nflat_ls+blk*nshort],
                                  &transf_buf[nshort*(2*blk-1)],
                                  &transf_buf[nshort*(2*blk)],
                                  window_short, nshort, nshort);
        }

        /* seam 4 straddles the frame boundary: its first 'trans' samples are
           output (with carried overlap), the rest starts the new overlap */
        fb_sse_short_seam_ola(&time_out[nflat_ls+4*nshort],
                              &overlap[nflat_ls+4*nshort],
                              &transf_buf[nshort*7], &transf_buf[nshort*8],
                              window_short, nshort, trans);
        fb_sse_short_seam(&time_out[nflat_ls+4*nshort],
                          &transf_buf[nshort*7], &transf_buf[nshort*8],
                          window_short, nshort, trans, nshort);

        /* seams 5..7 go entirely into the new overlap */
        for (blk = 5; blk <= 7; blk++)
        {
            fb_sse_short_seam(&time_out[nflat_ls+blk*nshort],
                              &transf_buf[nshort*(2*blk-1)],
                              &transf_buf[nshort*(2*blk)],
                              window_short, nshort, 0, nshort);
        }

        /* falling half of the last short block */
        for (i = 0; i < nshort; i+=4)
        {
            __m128 last  = _mm_load_ps(&transf_buf[nshort*15+i]);
            __m128 w_rev = fb_sse_load_reversed(&window_short[nshort-4-i]);

            _mm_store_ps(&time_out[nflat_ls+8*nshort+i], _mm_mul_ps(last, w_rev));
        }

        /* remainder of the new overlap is zero */
        for (i = 0; i < nflat_ls; i+=4)
        {
            _mm_store_ps(&overlap[nflat_ls+nshort+i], _mm_setzero_ps());
        }
        break;

    case LONG_STOP_SEQUENCE:
        imdct_long_sse(fb, freq_in, transf_buf, 2*nlong);

        /* first-half window: zeros, rising short window, flat ones */
        for (i = 0; i < nflat_ls; i+=4)
        {
            _mm_store_ps(&time_out[i], _mm_load_ps(&overlap[i]));
        }
        for (i = 0; i < nshort; i+=4)
        {
            __m128 first   = _mm_load_ps(&transf_buf[nflat_ls+i]);
            __m128 w_prev  = _mm_load_ps(&window_short_prev[i]);
            __m128 carried = _mm_load_ps(&overlap[nflat_ls+i]);

            _mm_store_ps(&time_out[nflat_ls+i], _mm_add_ps(_mm_mul_ps(first, w_prev), carried));
        }
        for (i = 0; i < nflat_ls; i+=4)
        {
            __m128 first   = _mm_load_ps(&transf_buf[nflat_ls+nshort+i]);
            __m128 carried = _mm_load_ps(&overlap[nflat_ls+nshort+i]);

            _mm_store_ps(&time_out[nflat_ls+nshort+i], _mm_add_ps(first, carried));
        }

        /* new overlap = second half * current long window read back to front */
        for (i = 0; i < nlong; i+=4)
        {
            __m128 second = _mm_load_ps(&transf_buf[nlong+i]);
            __m128 w_rev  = fb_sse_load_reversed(&window_long[nlong-4-i]);

            _mm_store_ps(&overlap[i], _mm_mul_ps(second, w_rev));
        }
        break;
    }

#ifdef PROFILE
    count = faad_get_ts() - count;
    fb->cycles += count;
#endif
}
#endif
#ifdef LTP_DEC
/*
 * Forward filter bank used by LTP: windows 2*frame_len time samples and
 * runs the forward MDCT on them.
 * only works for LTP -> no overlapping, no short blocks
 *
 * fb                 filter bank from filter_bank_init()
 * window_sequence    ONLY_LONG / LONG_START / LONG_STOP
 *                    (EIGHT_SHORT_SEQUENCE is rejected by an assert)
 * window_shape       index into the window tables for the current frame
 * window_shape_prev  index into the window tables for the previous frame
 * in_data            2*frame_len input samples
 * out_mdct           receives the MDCT output
 * object_type        with LD_DEC, LD selects the ld_window tables
 * frame_len          samples per frame; must not exceed 1024 because the
 *                    scratch buffer holds 2*1024 values
 */
void filter_bank_ltp(fb_info *fb, uint8_t window_sequence, uint8_t window_shape,
                     uint8_t window_shape_prev, real_t *in_data, real_t *out_mdct,
                     uint8_t object_type, uint16_t frame_len)
{
    int16_t i;
    ALIGN real_t windowed_buf[2*1024] = {0};

    const real_t *window_long = NULL;
    const real_t *window_long_prev = NULL;
    const real_t *window_short = NULL;
    const real_t *window_short_prev = NULL;

    uint16_t nlong = frame_len;
    uint16_t nshort = frame_len/8;
    uint16_t nflat_ls = (nlong-nshort)/2;

    assert(window_sequence != EIGHT_SHORT_SEQUENCE);

    /* the windowed input (2*nlong values) has to fit into windowed_buf */
    if (2u*nlong > sizeof(windowed_buf)/sizeof(windowed_buf[0]))
        return;

#ifdef LD_DEC
    if (object_type == LD)
    {
        window_long       = fb->ld_window[window_shape];
        window_long_prev  = fb->ld_window[window_shape_prev];
    } else {
#endif
        window_long       = fb->long_window[window_shape];
        window_long_prev  = fb->long_window[window_shape_prev];
        window_short      = fb->short_window[window_shape];
        window_short_prev = fb->short_window[window_shape_prev];
#ifdef LD_DEC
    }
#endif

    switch (window_sequence)
    {
    case ONLY_LONG_SEQUENCE:
        /* rising previous window on the first half, current window read
           back to front on the second half */
        for (i = 0; i < nlong; i++)
        {
            windowed_buf[i]       = MUL_F(in_data[i], window_long_prev[i]);
            windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
        }
        mdct(fb, windowed_buf, out_mdct, 2*nlong);
        break;

    case LONG_START_SEQUENCE:
        for (i = 0; i < nlong; i++)
            windowed_buf[i] = MUL_F(in_data[i], window_long_prev[i]);
        /* second half: flat ones, falling short window, zeros */
        for (i = 0; i < nflat_ls; i++)
            windowed_buf[i+nlong] = in_data[i+nlong];
        for (i = 0; i < nshort; i++)
            windowed_buf[i+nlong+nflat_ls] = MUL_F(in_data[i+nlong+nflat_ls], window_short[nshort-1-i]);
        for (i = 0; i < nflat_ls; i++)
            windowed_buf[i+nlong+nflat_ls+nshort] = 0;
        mdct(fb, windowed_buf, out_mdct, 2*nlong);
        break;

    case LONG_STOP_SEQUENCE:
        /* first half: zeros, rising short window, flat ones */
        for (i = 0; i < nflat_ls; i++)
            windowed_buf[i] = 0;
        for (i = 0; i < nshort; i++)
            windowed_buf[i+nflat_ls] = MUL_F(in_data[i+nflat_ls], window_short_prev[i]);
        for (i = 0; i < nflat_ls; i++)
            windowed_buf[i+nflat_ls+nshort] = in_data[i+nflat_ls+nshort];
        for (i = 0; i < nlong; i++)
            windowed_buf[i+nlong] = MUL_F(in_data[i+nlong], window_long[nlong-1-i]);
        mdct(fb, windowed_buf, out_mdct, 2*nlong);
        break;
    }
}
#endif