/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifdef HAVE_CONFIG_H
#  include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <xmmintrin.h> /* SSE */
/*   new routines: more unaligned loads, less shuffle
 *   old routines: less unaligned loads, more shuffle
 *   these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
 */

/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
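
/* All eight routines below compute the same quantity as the portable C
 * implementation (FLAC__lpc_compute_autocorrelation in lpc.c):
 *
 *     autoc[k] = sum over i of data[i] * data[i+k],   0 <= k < lag
 *
 * A minimal scalar sketch of that computation, for orientation only
 * (not part of the build):
 *
 *     for(k = 0; k < lag; k++) {
 *         autoc[k] = 0.0f;
 *         for(i = 0; i + k < data_len; i++)
 *             autoc[k] += data[i] * data[i+k];
 *     }
 *
 * The SSE versions always produce a whole number of 4-float vectors
 * (lag rounded up to 4, 8, 12 or 16), so autoc[] must have room for
 * that many entries even when the requested lag is smaller.
 */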
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 4;
	__m128 sum0;

	(void) lag;
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();

	/* main loop: every sample whose 4 following samples are all in range */
	for(i = 0; i <= limit; i++) {
		__m128 d, d0;
		d0 = _mm_loadu_ps(data+i);
		d = _mm_shuffle_ps(d0, d0, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
	}

	/* tail loop: finish the last samples with a register shift instead of reading past the end */
	{
		__m128 d0 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_move_ss(d0, d);
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
}
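
/* Note on the pattern used by the _new routines, with lag_4 above as the
 * model: the main loop broadcasts data[i] to all four lanes with one
 * shuffle and multiplies it against the unaligned load of data[i..i+3],
 * so each iteration adds data[i]*data[i+k] to lane k.  That covers every
 * product whose higher index is still inside the buffer (i <= data_len-4).
 * The tail loop then walks the final samples backwards, keeping
 * data[i..i+3] (zero-padded past the end) in d0 as a shift register:
 * rotate with _mm_shuffle_ps, insert the new sample with _mm_move_ss.
 * The lag_8/_12/_16 versions below unroll exactly this scheme over two,
 * three and four accumulator vectors.
 */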
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 8;
	__m128 sum0, sum1;

	(void) lag;
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d = _mm_shuffle_ps(d0, d0, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 12;
	__m128 sum0, sum1, sum2;

	(void) lag;
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d = _mm_shuffle_ps(d0, d0, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 16;
	__m128 sum0, sum1, sum2, sum3;

	(void) lag;
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();
	sum3 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2, d3;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d3 = _mm_loadu_ps(data+i+12);
		d = _mm_shuffle_ps(d0, d0, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		__m128 d3 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d3 = _mm_move_ss(d3, d2);
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
	_mm_storeu_ps(autoc+12,sum3);
}
/* old routines: faster on older Intel CPUs (up to Core 2) */
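
/* Note on the pattern used by the _old routines: instead of unaligned
 * vector loads they read one new sample per iteration
 * (_mm_load_ss/_mm_load1_ps) and keep the window of the most recent
 * samples entirely in registers.  Each step, every window vector is
 * rotated left one float (_mm_shuffle_ps) and the vacated low slot is
 * refilled from the neighbouring vector, or from the new sample, with
 * _mm_move_ss; the broadcast sample is then multiplied against the
 * whole window.  More shuffle work, fewer unaligned loads, which was
 * the better trade on pre-Nehalem Intel cores.
 */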
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm2, xmm5;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();

	/* prime the window with the first sample */
	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm2 left by one float and insert the new sample */
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_move_ss(xmm2, xmm0);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
}
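
/* Worked trace of the window register in lag_4_old above, for input
 * samples x0, x1, x2, ... (illustrative only):
 *
 *     after priming:   xmm2 = [ x0   0   0   0 ],  xmm5 += x0*xmm2
 *     after x1:        xmm2 = [ x1  x0   0   0 ],  xmm5 += x1*xmm2
 *     after x2:        xmm2 = [ x2  x1  x0   0 ],  xmm5 += x2*xmm2
 *
 * Lane k of xmm5 therefore accumulates data[i]*data[i-k], which is
 * autoc[k] once the whole buffer has been consumed.
 */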
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm3:xmm2 left by one float */
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm6|xmm5 += xmm0|xmm0 * xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm4:xmm3:xmm2 left by one float */
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm7|xmm6|xmm5 += xmm0|xmm0|xmm0 * xmm4|xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm4);
		xmm7 = _mm_add_ps(xmm7, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
	_mm_storeu_ps(autoc+8, xmm7);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();
	xmm8 = _mm_setzero_ps();
	xmm9 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();
	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm6 = _mm_add_ps(xmm6, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm5 = _mm_move_ss(xmm5, xmm4);
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm5);
		xmm9 = _mm_add_ps(xmm9, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm4);
		xmm8 = _mm_add_ps(xmm8, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm7 = _mm_add_ps(xmm7, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm6);
	_mm_storeu_ps(autoc+4, xmm7);
	_mm_storeu_ps(autoc+8, xmm8);
	_mm_storeu_ps(autoc+12,xmm9);
}
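
/* None of the routines above is called directly by name elsewhere in
 * libFLAC; the encoder installs one of them in a function pointer at
 * init time based on the detected CPU and the largest lag it will need
 * (larger lags fall back to the plain C routine).  A hypothetical
 * dispatch sketch -- the helper name, its parameters and the exact
 * cutoff policy are invented here for illustration, and it assumes
 * max_lag <= 16:
 *
 *     typedef void (*autoc_fn)(const FLAC__real[], uint32_t, uint32_t, FLAC__real[]);
 *
 *     static autoc_fn pick_autoc(uint32_t max_lag, FLAC__bool old_cpu)
 *     {
 *         if(max_lag <= 4)
 *             return old_cpu ? FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old
 *                            : FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new;
 *         if(max_lag <= 8)
 *             return old_cpu ? FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old
 *                            : FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new;
 *         if(max_lag <= 12)
 *             return old_cpu ? FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old
 *                            : FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new;
 *         return old_cpu ? FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old
 *                        : FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new;
 *     }
 */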
#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */