/*
** FAAD2 - Freeware Advanced Audio (AAC) Decoder including SBR decoding
** Copyright (C) 2003-2004 M. Bakker, Ahead Software AG, http://www.nero.com
**
** This program is free software; you can redistribute it and/or modify
** it under the terms of the GNU General Public License as published by
** the Free Software Foundation; either version 2 of the License, or
** (at your option) any later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software
** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
**
** Any non-GPL usage of this software or parts of this software is strictly
** forbidden.
**
** Commercial non-GPL licensing of this software is possible.
** For more info contact Ahead Software through Mpeg4AAClicense@nero.com.
**
** $Id: filtbank.c,v 1.38 2004/06/30 12:45:56 menno Exp $
**/
/*
 * filter_bank_init: allocate a zero-filled fb_info, create three MDCT
 * contexts and install the window tables for the given frame length.
 *
 * frame_len  frame length; the visible branches cover 1024 and, per the
 *            inline comment on the else branch, 960.
 *
 * Derived sizes: nshort = frame_len/8, frame_len_ld = frame_len/2.
 * MDCT contexts: mdct256 (2*nshort), mdct2048 (2*frame_len),
 *                mdct1024 (2*frame_len_ld).
 * Window tables: index 0 takes the sine tables, index 1 takes the kbd
 *                tables (ld_mid for ld_window).
 *
 * NOTE(review): this listing embeds its own line numbers and has gaps in
 * them; braces, the closing of the conditionals and the return statement
 * are not visible. Presumably fb is returned to the caller - confirm
 * against a pristine copy of the file.
 */
47 fb_info
*filter_bank_init(uint16_t frame_len
)
49 uint16_t nshort
= frame_len
/8;
51 uint16_t frame_len_ld
= frame_len
/2;
/* NOTE(review): the faad_malloc result goes straight to memset; no NULL check is visible. */
54 fb_info
*fb
= (fb_info
*)faad_malloc(sizeof(fb_info
));
55 memset(fb
, 0, sizeof(fb_info
));
/* MDCT context for the short blocks, then for the long block. */
58 fb
->mdct256
= faad_mdct_init(2*nshort
);
59 fb
->mdct2048
= faad_mdct_init(2*frame_len
);
/* MDCT context sized for half the frame length (frame_len_ld). */
62 fb
->mdct1024
= faad_mdct_init(2*frame_len_ld
);
/* Window tables for frame_len == 1024. */
65 #ifdef ALLOW_SMALL_FRAMELENGTH
66 if (frame_len
== 1024)
69 fb
->long_window
[0] = sine_long_1024
;
70 fb
->short_window
[0] = sine_short_128
;
71 fb
->long_window
[1] = kbd_long_1024
;
72 fb
->short_window
[1] = kbd_short_128
;
74 fb
->ld_window
[0] = sine_mid_512
;
75 fb
->ld_window
[1] = ld_mid_512
;
/* Window tables for the other supported frame length (960). */
77 #ifdef ALLOW_SMALL_FRAMELENGTH
78 } else /* (frame_len == 960) */ {
79 fb
->long_window
[0] = sine_long_960
;
80 fb
->short_window
[0] = sine_short_120
;
81 fb
->long_window
[1] = kbd_long_960
;
82 fb
->short_window
[1] = kbd_short_120
;
84 fb
->ld_window
[0] = sine_mid_480
;
85 fb
->ld_window
[1] = ld_mid_480
;
/*
 * NOTE(review): both if_func assignments appear back to back here; the
 * condition choosing between the SSE and the plain routine is not visible
 * in this listing (listing lines 86-92 and 94 are absent).
 */
93 fb
->if_func
= ifilter_bank_sse
;
95 fb
->if_func
= ifilter_bank
;
/*
 * filter_bank_end: tear down a filter bank created by filter_bank_init.
 * Prints the accumulated cycle counter and ends the three MDCT contexts
 * (mdct256, mdct2048, mdct1024).
 *
 * fb  filter bank state to tear down.
 *
 * NOTE(review): %I64d is a Microsoft-specific length modifier; on other C
 * libraries it is not a valid conversion. Listing lines 103-106, 108-109
 * and 114 onward are absent, so any guard on fb, the conditional around
 * the printf and the release of fb itself are not visible here - confirm
 * against a pristine copy.
 */
102 void filter_bank_end(fb_info
*fb
)
107 printf("FB: %I64d cycles\n", fb
->cycles
);
110 faad_mdct_end(fb
->mdct256
);
111 faad_mdct_end(fb
->mdct2048
);
113 faad_mdct_end(fb
->mdct1024
);
/*
 * imdct_long: run the inverse MDCT for a long block through faad_imdct.
 *
 * fb        filter bank state; fb->mdct2048 is the context used by the
 *           second visible call.
 * in_data   input passed to faad_imdct.
 * out_data  output passed to faad_imdct.
 * len       transform length; not referenced in the visible lines.
 *
 * NOTE(review): listing lines 124-136 and 138 are absent. As shown, the
 * local mdct is still NULL at the first call and both calls run in
 * sequence; presumably a selection on len and a conditional-compilation
 * split between the two calls were dropped - confirm.
 */
120 static INLINE
void imdct_long(fb_info
*fb
, real_t
*in_data
, real_t
*out_data
, uint16_t len
)
123 mdct_info
*mdct
= NULL
;
137 faad_imdct(mdct
, in_data
, out_data
);
139 faad_imdct(fb
->mdct2048
, in_data
, out_data
);
/*
 * imdct_long_sse: same shape as imdct_long, but calls faad_imdct_sse.
 *
 * fb        filter bank state; fb->mdct2048 is the context used by the
 *           second visible call.
 * in_data   input passed to faad_imdct_sse.
 * out_data  output passed to faad_imdct_sse.
 * len       transform length; not referenced in the visible lines.
 *
 * NOTE(review): listing lines 148-160 and 162 are absent. As shown, the
 * local mdct is still NULL at the first call and both calls run in
 * sequence; presumably a selection on len and a conditional-compilation
 * split were dropped - confirm.
 */
144 static INLINE
void imdct_long_sse(fb_info
*fb
, real_t
*in_data
, real_t
*out_data
, uint16_t len
)
147 mdct_info
*mdct
= NULL
;
161 faad_imdct_sse(mdct
, in_data
, out_data
);
163 faad_imdct_sse(fb
->mdct2048
, in_data
, out_data
);
/*
 * mdct: run the forward MDCT through faad_mdct.
 *
 * fb        filter bank state; not referenced in the visible lines.
 * in_data   input passed to faad_mdct.
 * out_data  output passed to faad_mdct.
 * len       transform length; not referenced in the visible lines.
 *
 * NOTE(review): listing lines 172-190 are absent. As shown, the local
 * mdct is still NULL when faad_mdct is called; presumably the context is
 * picked from fb according to len in the missing lines - confirm. The
 * local variable also shares its name with this function.
 */
169 static INLINE
void mdct(fb_info
*fb
, real_t
*in_data
, real_t
*out_data
, uint16_t len
)
171 mdct_info
*mdct
= NULL
;
191 faad_mdct(mdct
, in_data
, out_data
);
/*
 * ifilter_bank: inverse filter bank. Runs the inverse MDCT on freq_in into
 * the local transf_buf, applies the windows chosen by window_shape and
 * window_shape_prev, adds the saved overlap to form time_out, and writes
 * the windowed second half back into overlap.
 *
 * fb                 filter bank state (MDCT contexts and window tables)
 * window_sequence    selects one of the four cases in the switch below
 * window_shape       window table index for the current frame
 * window_shape_prev  window table index for the previous frame
 * freq_in            input coefficients handed to the inverse MDCT
 * time_out           output samples
 * overlap            read as the tail of the previous frame, then
 *                    rewritten with the tail of this frame
 * object_type        compared with LD when picking the window tables
 * frame_len          frame length; nlong = frame_len, nshort = frame_len/8
 *
 * NOTE(review): this listing embeds its own line numbers and has gaps in
 * them. Braces, the declaration of i, the break statements and the
 * conditionals around the LD branch, the debug output and the profiling
 * lines are not visible. transf_buf is fixed at 2*1024 elements no matter
 * what frame_len is.
 */
195 void ifilter_bank(fb_info
*fb
, uint8_t window_sequence
, uint8_t window_shape
,
196 uint8_t window_shape_prev
, real_t
*freq_in
,
197 real_t
*time_out
, real_t
*overlap
,
198 uint8_t object_type
, uint16_t frame_len
)
201 ALIGN real_t transf_buf
[2*1024] = {0};
203 const real_t
*window_long
= NULL
;
204 const real_t
*window_long_prev
= NULL
;
205 const real_t
*window_short
= NULL
;
206 const real_t
*window_short_prev
= NULL
;
/* trans is half a short block; no visible line of this function reads it. */
208 uint16_t nlong
= frame_len
;
209 uint16_t nshort
= frame_len
/8;
210 uint16_t trans
= nshort
/2;
/* Length of each flat region on either side of the short window. */
212 uint16_t nflat_ls
= (nlong
-nshort
)/2;
/* Start timestamp; the difference is taken at the end of the function. */
215 int64_t count
= faad_get_ts();
218 /* select windows of current frame and previous frame (Sine or KBD) */
/* LD object type: only the long window pointers are set, from ld_window. */
220 if (object_type
== LD
)
222 window_long
= fb
->ld_window
[window_shape
];
223 window_long_prev
= fb
->ld_window
[window_shape_prev
];
/* Other object types: long and short window pointers from the regular tables. */
226 window_long
= fb
->long_window
[window_shape
];
227 window_long_prev
= fb
->long_window
[window_shape_prev
];
228 window_short
= fb
->short_window
[window_shape
];
229 window_short_prev
= fb
->short_window
[window_shape_prev
];
/*
 * NOTE(review): debug dump of the input and of the sequence and shape.
 * The loop bound is the literal 1024, not frame_len, and %d is used on
 * real_t elements; the real_t type is not visible here - if it is a
 * floating type the conversion does not match. Presumably this sits in a
 * disabled region in the pristine file - confirm.
 */
235 for (i
= 0; i
< 1024; i
++)
237 printf("%d\n", freq_in
[i
]);
242 printf("%d %d\n", window_sequence
, window_shape
);
245 switch (window_sequence
)
/* Long block: previous long window on the first half, current long window reversed on the second. */
247 case ONLY_LONG_SEQUENCE
:
249 imdct_long(fb
, freq_in
, transf_buf
, 2*nlong
);
251 /* add second half output of previous frame to windowed output of current frame */
/* Unrolled by four, so nlong is assumed to be a multiple of four. */
252 for (i
= 0; i
< nlong
; i
+=4)
254 time_out
[i
] = overlap
[i
] + MUL_F(transf_buf
[i
],window_long_prev
[i
]);
255 time_out
[i
+1] = overlap
[i
+1] + MUL_F(transf_buf
[i
+1],window_long_prev
[i
+1]);
256 time_out
[i
+2] = overlap
[i
+2] + MUL_F(transf_buf
[i
+2],window_long_prev
[i
+2]);
257 time_out
[i
+3] = overlap
[i
+3] + MUL_F(transf_buf
[i
+3],window_long_prev
[i
+3]);
260 /* window the second half and save as overlap for next frame */
261 for (i
= 0; i
< nlong
; i
+=4)
263 overlap
[i
] = MUL_F(transf_buf
[nlong
+i
],window_long
[nlong
-1-i
]);
264 overlap
[i
+1] = MUL_F(transf_buf
[nlong
+i
+1],window_long
[nlong
-2-i
]);
265 overlap
[i
+2] = MUL_F(transf_buf
[nlong
+i
+2],window_long
[nlong
-3-i
]);
266 overlap
[i
+3] = MUL_F(transf_buf
[nlong
+i
+3],window_long
[nlong
-4-i
]);
/* Start block: first half as for a long block; the tail is flat, then the short window reversed, then zero. */
270 case LONG_START_SEQUENCE
:
272 imdct_long(fb
, freq_in
, transf_buf
, 2*nlong
);
274 /* add second half output of previous frame to windowed output of current frame */
275 for (i
= 0; i
< nlong
; i
+=4)
277 time_out
[i
] = overlap
[i
] + MUL_F(transf_buf
[i
],window_long_prev
[i
]);
278 time_out
[i
+1] = overlap
[i
+1] + MUL_F(transf_buf
[i
+1],window_long_prev
[i
+1]);
279 time_out
[i
+2] = overlap
[i
+2] + MUL_F(transf_buf
[i
+2],window_long_prev
[i
+2]);
280 time_out
[i
+3] = overlap
[i
+3] + MUL_F(transf_buf
[i
+3],window_long_prev
[i
+3]);
283 /* window the second half and save as overlap for next frame */
284 /* construct second half window using padding with 1's and 0's */
285 for (i
= 0; i
< nflat_ls
; i
++)
286 overlap
[i
] = transf_buf
[nlong
+i
];
287 for (i
= 0; i
< nshort
; i
++)
288 overlap
[nflat_ls
+i
] = MUL_F(transf_buf
[nlong
+nflat_ls
+i
],window_short
[nshort
-i
-1]);
289 for (i
= 0; i
< nflat_ls
; i
++)
290 overlap
[nflat_ls
+nshort
+i
] = 0;
/* Eight short blocks: each inverse MDCT writes 2*nshort values into its own slice of transf_buf. */
293 case EIGHT_SHORT_SEQUENCE
:
294 /* perform iMDCT for each short block */
295 faad_imdct(fb
->mdct256
, freq_in
+0*nshort
, transf_buf
+2*nshort
*0);
296 faad_imdct(fb
->mdct256
, freq_in
+1*nshort
, transf_buf
+2*nshort
*1);
297 faad_imdct(fb
->mdct256
, freq_in
+2*nshort
, transf_buf
+2*nshort
*2);
298 faad_imdct(fb
->mdct256
, freq_in
+3*nshort
, transf_buf
+2*nshort
*3);
299 faad_imdct(fb
->mdct256
, freq_in
+4*nshort
, transf_buf
+2*nshort
*4);
300 faad_imdct(fb
->mdct256
, freq_in
+5*nshort
, transf_buf
+2*nshort
*5);
301 faad_imdct(fb
->mdct256
, freq_in
+6*nshort
, transf_buf
+2*nshort
*6);
302 faad_imdct(fb
->mdct256
, freq_in
+7*nshort
, transf_buf
+2*nshort
*7);
304 /* add second half output of previous frame to windowed output of current frame */
/* The leading flat region is a straight copy of the saved overlap. */
305 for (i
= 0; i
< nflat_ls
; i
++)
306 time_out
[i
] = overlap
[i
];
/* Each output slice sums the falling half of one short block and the rising half of the next. */
307 for(i
= 0; i
< nshort
; i
++)
309 time_out
[nflat_ls
+ i
] = overlap
[nflat_ls
+ i
] + MUL_F(transf_buf
[nshort
*0+i
],window_short_prev
[i
]);
310 time_out
[nflat_ls
+1*nshort
+i
] = overlap
[nflat_ls
+nshort
*1+i
] + MUL_F(transf_buf
[nshort
*1+i
],window_short
[nshort
-1-i
]) + MUL_F(transf_buf
[nshort
*2+i
],window_short
[i
]);
311 time_out
[nflat_ls
+2*nshort
+i
] = overlap
[nflat_ls
+nshort
*2+i
] + MUL_F(transf_buf
[nshort
*3+i
],window_short
[nshort
-1-i
]) + MUL_F(transf_buf
[nshort
*4+i
],window_short
[i
]);
312 time_out
[nflat_ls
+3*nshort
+i
] = overlap
[nflat_ls
+nshort
*3+i
] + MUL_F(transf_buf
[nshort
*5+i
],window_short
[nshort
-1-i
]) + MUL_F(transf_buf
[nshort
*6+i
],window_short
[i
]);
/*
 * NOTE(review): listing line 313 is absent. As shown, this store runs for
 * every i below nshort, and nflat_ls+4*nshort+i reaches nlong once i
 * reaches trans. The SSE variant in this file limits the same step to
 * i below trans, so presumably a guard on i was dropped here - confirm.
 */
314 time_out
[nflat_ls
+4*nshort
+i
] = overlap
[nflat_ls
+nshort
*4+i
] + MUL_F(transf_buf
[nshort
*7+i
],window_short
[nshort
-1-i
]) + MUL_F(transf_buf
[nshort
*8+i
],window_short
[i
]);
317 /* window the second half and save as overlap for next frame */
318 for(i
= 0; i
< nshort
; i
++)
/*
 * NOTE(review): listing line 320 is absent. As shown, the index
 * nflat_ls+4*nshort+i-nlong is negative while i is below trans; the SSE
 * variant starts the same step at i = trans, so presumably a guard on i
 * was dropped here - confirm.
 */
321 overlap
[nflat_ls
+4*nshort
+i
-nlong
] = MUL_F(transf_buf
[nshort
*7+i
],window_short
[nshort
-1-i
]) + MUL_F(transf_buf
[nshort
*8+i
],window_short
[i
]);
322 overlap
[nflat_ls
+5*nshort
+i
-nlong
] = MUL_F(transf_buf
[nshort
*9+i
],window_short
[nshort
-1-i
]) + MUL_F(transf_buf
[nshort
*10+i
],window_short
[i
]);
323 overlap
[nflat_ls
+6*nshort
+i
-nlong
] = MUL_F(transf_buf
[nshort
*11+i
],window_short
[nshort
-1-i
]) + MUL_F(transf_buf
[nshort
*12+i
],window_short
[i
]);
324 overlap
[nflat_ls
+7*nshort
+i
-nlong
] = MUL_F(transf_buf
[nshort
*13+i
],window_short
[nshort
-1-i
]) + MUL_F(transf_buf
[nshort
*14+i
],window_short
[i
]);
325 overlap
[nflat_ls
+8*nshort
+i
-nlong
] = MUL_F(transf_buf
[nshort
*15+i
],window_short
[nshort
-1-i
]);
/* The trailing flat region of the overlap is cleared. */
327 for (i
= 0; i
< nflat_ls
; i
++)
328 overlap
[nflat_ls
+nshort
+i
] = 0;
/* Stop block: the head is the overlap alone, then the previous short window, then unwindowed; the tail uses the long window reversed. */
331 case LONG_STOP_SEQUENCE
:
333 imdct_long(fb
, freq_in
, transf_buf
, 2*nlong
);
335 /* add second half output of previous frame to windowed output of current frame */
336 /* construct first half window using padding with 1's and 0's */
337 for (i
= 0; i
< nflat_ls
; i
++)
338 time_out
[i
] = overlap
[i
];
339 for (i
= 0; i
< nshort
; i
++)
340 time_out
[nflat_ls
+i
] = overlap
[nflat_ls
+i
] + MUL_F(transf_buf
[nflat_ls
+i
],window_short_prev
[i
]);
341 for (i
= 0; i
< nflat_ls
; i
++)
342 time_out
[nflat_ls
+nshort
+i
] = overlap
[nflat_ls
+nshort
+i
] + transf_buf
[nflat_ls
+nshort
+i
];
344 /* window the second half and save as overlap for next frame */
345 for (i
= 0; i
< nlong
; i
++)
346 overlap
[i
] = MUL_F(transf_buf
[nlong
+i
],window_long
[nlong
-1-i
]);
/*
 * NOTE(review): debug dump of the output. The bound is the literal 1024
 * and %X is used on real_t elements; presumably this sits in a disabled
 * region in the pristine file - confirm.
 */
351 for (i
= 0; i
< 1024; i
++)
353 //printf("%d\n", time_out[i]);
354 printf("0x%.8X\n", time_out
[i
]);
/* Elapsed timestamp for this call; the line that consumes count is not visible here. */
360 count
= faad_get_ts() - count
;
/*
 * ifilter_bank_sse: inverse filter bank written with SSE intrinsics,
 * handling four real_t values per step. It follows the same four window
 * sequences as ifilter_bank but takes no separate overlap argument: the
 * tail saved by the previous call is read from time_out[nlong ...] and the
 * new tail is stored back to the same region.
 *
 * fb                 filter bank state (MDCT contexts and window tables)
 * window_sequence    selects one of the four cases in the switch below
 * window_shape       window table index for the current frame
 * window_shape_prev  window table index for the previous frame
 * freq_in            input coefficients handed to the inverse MDCT
 * time_out           output samples in [0, nlong) followed by the overlap
 *                    region starting at nlong
 * object_type        compared with LD when picking the window tables
 * frame_len          frame length; nlong = frame_len, nshort = frame_len/8
 *
 * Reversed windows are produced by loading four values at offset
 * size-4-i and reversing the lanes with _MM_SHUFFLE(0, 1, 2, 3).
 *
 * NOTE(review): _mm_load_ps and _mm_store_ps need 16-byte aligned
 * addresses, so the buffers and every offset used here are assumed to be
 * aligned accordingly - confirm for the callers and the window tables.
 * This listing embeds its own line numbers and has gaps in them; braces,
 * the declaration of i, the break statements and the conditionals around
 * the LD branch and the profiling lines are not visible.
 */
366 void ifilter_bank_sse(fb_info
*fb
, uint8_t window_sequence
, uint8_t window_shape
,
367 uint8_t window_shape_prev
, real_t
*freq_in
,
368 real_t
*time_out
, uint8_t object_type
, uint16_t frame_len
)
371 ALIGN real_t transf_buf
[2*1024] = {0};
373 const real_t
*window_long
= NULL
;
374 const real_t
*window_long_prev
= NULL
;
375 const real_t
*window_short
= NULL
;
376 const real_t
*window_short_prev
= NULL
;
378 uint16_t nlong
= frame_len
;
379 uint16_t nshort
= frame_len
/8;
380 uint16_t trans
= nshort
/2;
382 uint16_t nflat_ls
= (nlong
-nshort
)/2;
/* Start timestamp; the difference is taken at the end of the function. */
385 int64_t count
= faad_get_ts();
/* LD object type: only the long window pointers are set, from ld_window. */
389 if (object_type
== LD
)
391 window_long
= fb
->ld_window
[window_shape
];
392 window_long_prev
= fb
->ld_window
[window_shape_prev
];
/* Other object types: long and short window pointers from the regular tables. */
395 window_long
= fb
->long_window
[window_shape
];
396 window_long_prev
= fb
->long_window
[window_shape_prev
];
397 window_short
= fb
->short_window
[window_shape
];
398 window_short_prev
= fb
->short_window
[window_shape_prev
];
403 switch (window_sequence
)
/* Long block: one loop produces the output and the new overlap together. */
405 case ONLY_LONG_SEQUENCE
:
406 imdct_long_sse(fb
, freq_in
, transf_buf
, 2*nlong
);
407 for (i
= 0; i
< nlong
; i
+=4)
409 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
/* m3 holds the old overlap; it is loaded before the store to the same address below. */
411 m1
= _mm_load_ps(&transf_buf
[i
]);
412 m2
= _mm_load_ps(&window_long_prev
[i
]);
413 m6
= _mm_load_ps(&window_long
[nlong
-4-i
]);
414 m3
= _mm_load_ps(&time_out
[nlong
+i
]);
415 m5
= _mm_load_ps(&transf_buf
[nlong
+i
]);
417 m4
= _mm_mul_ps(m1
, m2
);
418 m7
= _mm_shuffle_ps(m6
, m6
, _MM_SHUFFLE(0, 1, 2, 3));
420 m4
= _mm_add_ps(m4
, m3
);
421 m8
= _mm_mul_ps(m5
, m7
);
423 _mm_store_ps(&time_out
[i
], m4
);
424 _mm_store_ps(&time_out
[nlong
+i
], m8
);
/* Start block: output as for a long block; the new overlap is flat, then short window reversed, then zero. */
428 case LONG_START_SEQUENCE
:
429 imdct_long_sse(fb
, freq_in
, transf_buf
, 2*nlong
);
430 for (i
= 0; i
< nlong
; i
+=4)
432 __m128 m1
= _mm_load_ps(&transf_buf
[i
]);
433 __m128 m2
= _mm_load_ps(&window_long_prev
[i
]);
434 __m128 m3
= _mm_load_ps(&time_out
[nlong
+i
]);
436 __m128 m4
= _mm_mul_ps(m1
, m2
);
437 m4
= _mm_add_ps(m4
, m3
);
439 _mm_store_ps(&time_out
[i
], m4
);
/* Flat part of the new overlap: a straight copy. */
441 for (i
= 0; i
< nflat_ls
; i
+=4)
443 __m128 m1
= _mm_load_ps(&transf_buf
[nlong
+i
]);
444 _mm_store_ps(&time_out
[nlong
+i
], m1
);
446 for (i
= 0; i
< nshort
; i
+=4)
448 __m128 m1
= _mm_load_ps(&transf_buf
[nlong
+nflat_ls
+i
]);
449 __m128 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
/* NOTE(review): m3 and m4 are used below without a visible declaration; listing line 450 is absent. */
452 m3
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
454 m4
= _mm_mul_ps(m1
, m3
);
456 _mm_store_ps(&time_out
[nlong
+nflat_ls
+i
], m4
);
/* The remaining part of the new overlap is zero. */
458 for (i
= 0; i
< nflat_ls
; i
+=4)
460 __m128 m1
= _mm_setzero_ps();
461 _mm_store_ps(&time_out
[nlong
+nflat_ls
+nshort
+i
], m1
);
/* Eight short blocks: each inverse MDCT writes 2*nshort values into its own slice of transf_buf. */
465 case EIGHT_SHORT_SEQUENCE
:
466 faad_imdct_sse(fb
->mdct256
, &freq_in
[0*nshort
], &transf_buf
[2*nshort
*0]);
467 faad_imdct_sse(fb
->mdct256
, &freq_in
[1*nshort
], &transf_buf
[2*nshort
*1]);
468 faad_imdct_sse(fb
->mdct256
, &freq_in
[2*nshort
], &transf_buf
[2*nshort
*2]);
469 faad_imdct_sse(fb
->mdct256
, &freq_in
[3*nshort
], &transf_buf
[2*nshort
*3]);
470 faad_imdct_sse(fb
->mdct256
, &freq_in
[4*nshort
], &transf_buf
[2*nshort
*4]);
471 faad_imdct_sse(fb
->mdct256
, &freq_in
[5*nshort
], &transf_buf
[2*nshort
*5]);
472 faad_imdct_sse(fb
->mdct256
, &freq_in
[6*nshort
], &transf_buf
[2*nshort
*6]);
473 faad_imdct_sse(fb
->mdct256
, &freq_in
[7*nshort
], &transf_buf
[2*nshort
*7]);
/* Leading flat region: copy of the saved overlap. */
474 for (i
= 0; i
< nflat_ls
; i
+=4)
476 __m128 m1
= _mm_load_ps(&time_out
[nlong
+i
]);
477 _mm_store_ps(&time_out
[i
], m1
);
/* First short block: rising half weighted with the previous short window. */
479 for (i
= 0; i
< nshort
; i
+=4)
481 __m128 m1
= _mm_load_ps(&transf_buf
[nshort
*0+i
]);
482 __m128 m2
= _mm_load_ps(&window_short_prev
[i
]);
483 __m128 m3
= _mm_load_ps(&time_out
[nlong
+nflat_ls
+i
]);
485 __m128 m4
= _mm_mul_ps(m1
, m2
);
486 m4
= _mm_add_ps(m4
, m3
);
488 _mm_store_ps(&time_out
[nflat_ls
+i
], m4
);
/* Output slice 1: falling half of block 0 plus rising half of block 1, plus the old overlap. */
490 for (i
= 0; i
< nshort
; i
+=4)
492 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
493 m1
= _mm_load_ps(&transf_buf
[nshort
*1+i
]);
494 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
495 m3
= _mm_load_ps(&time_out
[nlong
+nflat_ls
+nshort
*1+i
]);
496 m6
= _mm_load_ps(&transf_buf
[nshort
*2+i
]);
497 m7
= _mm_load_ps(&window_short
[i
]);
499 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
501 m4
= _mm_mul_ps(m1
, m5
);
502 m8
= _mm_mul_ps(m6
, m7
);
503 m4
= _mm_add_ps(m4
, m3
);
504 m4
= _mm_add_ps(m4
, m8
);
506 _mm_store_ps(&time_out
[nflat_ls
+1*nshort
+i
], m4
);
/* Output slice 2: same pattern using transf_buf slices 3 and 4. */
508 for (i
= 0; i
< nshort
; i
+=4)
510 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
511 m1
= _mm_load_ps(&transf_buf
[nshort
*3+i
]);
512 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
513 m3
= _mm_load_ps(&time_out
[nlong
+nflat_ls
+nshort
*2+i
]);
514 m6
= _mm_load_ps(&transf_buf
[nshort
*4+i
]);
515 m7
= _mm_load_ps(&window_short
[i
]);
517 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
519 m4
= _mm_mul_ps(m1
, m5
);
520 m8
= _mm_mul_ps(m6
, m7
);
521 m4
= _mm_add_ps(m4
, m3
);
522 m4
= _mm_add_ps(m4
, m8
);
524 _mm_store_ps(&time_out
[nflat_ls
+2*nshort
+i
], m4
);
/* Output slice 3: same pattern using transf_buf slices 5 and 6. */
526 for (i
= 0; i
< nshort
; i
+=4)
528 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
529 m1
= _mm_load_ps(&transf_buf
[nshort
*5+i
]);
530 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
531 m3
= _mm_load_ps(&time_out
[nlong
+nflat_ls
+nshort
*3+i
]);
532 m6
= _mm_load_ps(&transf_buf
[nshort
*6+i
]);
533 m7
= _mm_load_ps(&window_short
[i
]);
535 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
537 m4
= _mm_mul_ps(m1
, m5
);
538 m8
= _mm_mul_ps(m6
, m7
);
539 m4
= _mm_add_ps(m4
, m3
);
540 m4
= _mm_add_ps(m4
, m8
);
542 _mm_store_ps(&time_out
[nflat_ls
+3*nshort
+i
], m4
);
/* Output slice 4, first half (i below trans): still inside the output region, so the old overlap is added. */
544 for(i
= 0; i
< trans
; i
+=4)
546 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
547 m1
= _mm_load_ps(&transf_buf
[nshort
*7+i
]);
548 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
549 m3
= _mm_load_ps(&time_out
[nlong
+nflat_ls
+nshort
*4+i
]);
550 m6
= _mm_load_ps(&transf_buf
[nshort
*8+i
]);
551 m7
= _mm_load_ps(&window_short
[i
]);
553 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
555 m4
= _mm_mul_ps(m1
, m5
);
556 m8
= _mm_mul_ps(m6
, m7
);
557 m4
= _mm_add_ps(m4
, m3
);
558 m4
= _mm_add_ps(m4
, m8
);
560 _mm_store_ps(&time_out
[nflat_ls
+4*nshort
+i
], m4
);
/* Output slice 4, second half (i from trans): the store index is at or past nlong, so this begins the new overlap and nothing is added. */
562 for (i
= trans
; i
< nshort
; i
+=4)
564 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
565 m1
= _mm_load_ps(&transf_buf
[nshort
*7+i
]);
566 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
567 m6
= _mm_load_ps(&transf_buf
[nshort
*8+i
]);
568 m7
= _mm_load_ps(&window_short
[i
]);
570 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
572 m4
= _mm_mul_ps(m1
, m5
);
573 m8
= _mm_mul_ps(m6
, m7
);
574 m3
= _mm_add_ps(m4
, m8
);
576 _mm_store_ps(&time_out
[nflat_ls
+4*nshort
+i
], m3
);
/* New overlap, slice 5: transf_buf slices 9 and 10. */
578 for (i
= 0; i
< nshort
; i
+=4)
580 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
581 m1
= _mm_load_ps(&transf_buf
[nshort
*9+i
]);
582 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
583 m6
= _mm_load_ps(&transf_buf
[nshort
*10+i
]);
584 m7
= _mm_load_ps(&window_short
[i
]);
586 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
588 m4
= _mm_mul_ps(m1
, m5
);
589 m8
= _mm_mul_ps(m6
, m7
);
590 m3
= _mm_add_ps(m4
, m8
);
592 _mm_store_ps(&time_out
[nflat_ls
+5*nshort
+i
], m3
);
/* New overlap, slice 6: transf_buf slices 11 and 12. */
594 for (i
= 0; i
< nshort
; i
+=4)
596 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
597 m1
= _mm_load_ps(&transf_buf
[nshort
*11+i
]);
598 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
599 m6
= _mm_load_ps(&transf_buf
[nshort
*12+i
]);
600 m7
= _mm_load_ps(&window_short
[i
]);
602 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
604 m4
= _mm_mul_ps(m1
, m5
);
605 m8
= _mm_mul_ps(m6
, m7
);
606 m3
= _mm_add_ps(m4
, m8
);
608 _mm_store_ps(&time_out
[nflat_ls
+6*nshort
+i
], m3
);
/* New overlap, slice 7: transf_buf slices 13 and 14. */
610 for (i
= 0; i
< nshort
; i
+=4)
612 __m128 m1
, m2
, m3
, m4
, m5
, m6
, m7
, m8
;
613 m1
= _mm_load_ps(&transf_buf
[nshort
*13+i
]);
614 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
615 m6
= _mm_load_ps(&transf_buf
[nshort
*14+i
]);
616 m7
= _mm_load_ps(&window_short
[i
]);
618 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
620 m4
= _mm_mul_ps(m1
, m5
);
621 m8
= _mm_mul_ps(m6
, m7
);
622 m3
= _mm_add_ps(m4
, m8
);
624 _mm_store_ps(&time_out
[nflat_ls
+7*nshort
+i
], m3
);
/* New overlap, slice 8: only the falling half of the last short block. */
626 for (i
= 0; i
< nshort
; i
+=4)
628 __m128 m1
, m2
, m3
, m5
;
629 m1
= _mm_load_ps(&transf_buf
[nshort
*15+i
]);
630 m2
= _mm_load_ps(&window_short
[nshort
-4-i
]);
632 m5
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
634 m3
= _mm_mul_ps(m1
, m5
);
636 _mm_store_ps(&time_out
[nflat_ls
+8*nshort
+i
], m3
);
/* The rest of the new overlap is zero. */
638 for (i
= 0; i
< nflat_ls
; i
+=4)
640 __m128 m1
= _mm_setzero_ps();
641 _mm_store_ps(&time_out
[nlong
+nflat_ls
+nshort
+i
], m1
);
/* Stop block: the head is the overlap alone, then the previous short window, then unwindowed; the new overlap uses the long window reversed. */
645 case LONG_STOP_SEQUENCE
:
646 imdct_long_sse(fb
, freq_in
, transf_buf
, 2*nlong
);
647 for (i
= 0; i
< nflat_ls
; i
+=4)
649 __m128 m1
= _mm_load_ps(&time_out
[nlong
+i
]);
650 _mm_store_ps(&time_out
[i
], m1
);
652 for (i
= 0; i
< nshort
; i
+=4)
654 __m128 m1
= _mm_load_ps(&transf_buf
[nflat_ls
+i
]);
655 __m128 m2
= _mm_load_ps(&window_short_prev
[i
]);
656 __m128 m3
= _mm_load_ps(&time_out
[nlong
+nflat_ls
+i
]);
658 __m128 m4
= _mm_mul_ps(m1
, m2
);
659 m4
= _mm_add_ps(m4
, m3
);
661 _mm_store_ps(&time_out
[nflat_ls
+i
], m4
);
663 for (i
= 0; i
< nflat_ls
; i
+=4)
665 __m128 m1
= _mm_load_ps(&transf_buf
[nflat_ls
+nshort
+i
]);
666 __m128 m2
= _mm_load_ps(&time_out
[nlong
+nflat_ls
+nshort
+i
]);
668 __m128 m3
= _mm_add_ps(m1
, m2
);
670 _mm_store_ps(&time_out
[nflat_ls
+nshort
+i
], m3
);
672 for (i
= 0; i
< nlong
; i
+=4)
674 __m128 m1
= _mm_load_ps(&transf_buf
[nlong
+i
]);
675 __m128 m2
= _mm_load_ps(&window_long
[nlong
-4-i
]);
/* NOTE(review): m3 and m4 are used below without a visible declaration; listing lines 676-677 are absent. */
678 m3
= _mm_shuffle_ps(m2
, m2
, _MM_SHUFFLE(0, 1, 2, 3));
680 m4
= _mm_mul_ps(m1
, m3
);
682 _mm_store_ps(&time_out
[nlong
+i
], m4
);
/* Elapsed timestamp for this call; the line that consumes count is not visible here. */
688 count
= faad_get_ts() - count
;
/*
 * filter_bank_ltp: forward filter bank. Applies the windows for the given
 * window sequence to in_data, collecting the result in the local
 * windowed_buf, then runs the forward MDCT into out_mdct.
 *
 * fb                 filter bank state (MDCT contexts and window tables)
 * window_sequence    selects the case below; EIGHT_SHORT_SEQUENCE is
 *                    rejected by the assert
 * window_shape       window table index for the current frame
 * window_shape_prev  window table index for the previous frame
 * in_data            input samples; indices up to 2*nlong-1 are read
 * out_mdct           output handed to mdct()
 * object_type        compared with LD when picking the window tables
 * frame_len          frame length; nlong = frame_len, nshort = frame_len/8
 *
 * NOTE(review): this listing embeds its own line numbers and has gaps in
 * them; braces, the declaration of i, the break statements and the
 * conditionals around the LD branch are not visible. windowed_buf is fixed
 * at 2*1024 elements no matter what frame_len is.
 */
695 /* only works for LTP -> no overlapping, no short blocks */
696 void filter_bank_ltp(fb_info
*fb
, uint8_t window_sequence
, uint8_t window_shape
,
697 uint8_t window_shape_prev
, real_t
*in_data
, real_t
*out_mdct
,
698 uint8_t object_type
, uint16_t frame_len
)
701 ALIGN real_t windowed_buf
[2*1024] = {0};
703 const real_t
*window_long
= NULL
;
704 const real_t
*window_long_prev
= NULL
;
705 const real_t
*window_short
= NULL
;
706 const real_t
*window_short_prev
= NULL
;
708 uint16_t nlong
= frame_len
;
709 uint16_t nshort
= frame_len
/8;
710 uint16_t nflat_ls
= (nlong
-nshort
)/2;
/* Short blocks are not handled by this routine. */
712 assert(window_sequence
!= EIGHT_SHORT_SEQUENCE
);
/* LD object type: only the long window pointers are set, from ld_window. */
715 if (object_type
== LD
)
717 window_long
= fb
->ld_window
[window_shape
];
718 window_long_prev
= fb
->ld_window
[window_shape_prev
];
/* Other object types: long and short window pointers from the regular tables. */
721 window_long
= fb
->long_window
[window_shape
];
722 window_long_prev
= fb
->long_window
[window_shape_prev
];
723 window_short
= fb
->short_window
[window_shape
];
724 window_short_prev
= fb
->short_window
[window_shape_prev
];
729 switch(window_sequence
)
/* Long block: previous long window on the first half, current long window reversed on the second. */
731 case ONLY_LONG_SEQUENCE
:
/* NOTE(review): this descending loop only terminates if i is a signed type; the declaration of i is not visible here. */
732 for (i
= nlong
-1; i
>= 0; i
--)
734 windowed_buf
[i
] = MUL_F(in_data
[i
], window_long_prev
[i
]);
735 windowed_buf
[i
+nlong
] = MUL_F(in_data
[i
+nlong
], window_long
[nlong
-1-i
]);
737 mdct(fb
, windowed_buf
, out_mdct
, 2*nlong
);
/* Start block: long window on the first half; the second half is flat, then short window reversed, then zero. */
740 case LONG_START_SEQUENCE
:
741 for (i
= 0; i
< nlong
; i
++)
742 windowed_buf
[i
] = MUL_F(in_data
[i
], window_long_prev
[i
]);
743 for (i
= 0; i
< nflat_ls
; i
++)
744 windowed_buf
[i
+nlong
] = in_data
[i
+nlong
];
745 for (i
= 0; i
< nshort
; i
++)
746 windowed_buf
[i
+nlong
+nflat_ls
] = MUL_F(in_data
[i
+nlong
+nflat_ls
], window_short
[nshort
-1-i
]);
747 for (i
= 0; i
< nflat_ls
; i
++)
748 windowed_buf
[i
+nlong
+nflat_ls
+nshort
] = 0;
749 mdct(fb
, windowed_buf
, out_mdct
, 2*nlong
);
/* Stop block: short window then flat on the first half; long window reversed on the second. */
752 case LONG_STOP_SEQUENCE
:
/*
 * NOTE(review): listing line 754, the body of the next loop, is absent.
 * By symmetry with the start case it presumably clears the first nflat_ls
 * entries of windowed_buf - confirm against a pristine copy.
 */
753 for (i
= 0; i
< nflat_ls
; i
++)
755 for (i
= 0; i
< nshort
; i
++)
756 windowed_buf
[i
+nflat_ls
] = MUL_F(in_data
[i
+nflat_ls
], window_short_prev
[i
]);
757 for (i
= 0; i
< nflat_ls
; i
++)
758 windowed_buf
[i
+nflat_ls
+nshort
] = in_data
[i
+nflat_ls
+nshort
];
759 for (i
= 0; i
< nlong
; i
++)
760 windowed_buf
[i
+nlong
] = MUL_F(in_data
[i
+nlong
], window_long
[nlong
-1-i
]);
761 mdct(fb
, windowed_buf
, out_mdct
, 2*nlong
);