/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2016  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#include "private/cpu.h"

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE_SUPPORTED
#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <xmmintrin.h> /* SSE */
/* new routines: more unaligned loads, less shuffle
 * old routines: less unaligned loads, more shuffle
 * these *_old routines are equivalent to the ASM routines in ia32/lpc_asm.nasm
 */

/* new routines: faster on current Intel (starting from Core i aka Nehalem) and all AMD CPUs */
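
/* For orientation, the computation every routine below vectorizes is plain
 * autocorrelation; a minimal scalar sketch (a paraphrase of the generic C
 * fallback in lpc.c, not a drop-in copy of it):
 *
 *	for(coeff = 0; coeff < lag; coeff++)
 *		autoc[coeff] = 0.0f;
 *	for(sample = 0; sample <= data_len-lag; sample++)
 *		for(coeff = 0; coeff < lag; coeff++)
 *			autoc[coeff] += data[sample] * data[sample+coeff];
 *	for(; sample < data_len; sample++)
 *		for(coeff = 0; coeff < data_len-sample; coeff++)    ragged tail
 *			autoc[coeff] += data[sample] * data[sample+coeff];
 *
 * Each SSE routine keeps lag/4 accumulator vectors, so lane k of accumulator
 * n collects autoc[4*n+k].
 */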
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 4;
	__m128 sum0;

	(void) lag;
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0;
		d0 = _mm_loadu_ps(data+i);
		d = _mm_shuffle_ps(d0, d0, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_move_ss(d0, d);
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc, sum0);
}
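
/* How the routine above works (the lag_8/12/16 variants follow the same
 * pattern): the main loop forms all products data[i]*data[i+k], k = 0..3,
 * with one broadcast and one unaligned load, and stops where the 4-wide
 * load would run past the buffer.  The second loop then walks the tail
 * backwards, rotating d0 one lane left and inserting the newest sample in
 * lane 0, so after processing index i (zeros beyond the end of the data):
 *
 *	d0 = { data[i], data[i+1], data[i+2], data[i+3] }
 *
 * and broadcast(data[i]) * d0 adds data[i]*data[i+k] to lane k of sum0,
 * exactly the products the main loop could not form in-bounds.
 */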
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 8;
	__m128 sum0, sum1;

	(void) lag;
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d = _mm_shuffle_ps(d0, d0, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
}
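
/* In the 8-lag tail the window spans two registers, so the rotate has to
 * carry across them: both d0 and d1 are rotated, then d1 picks up d0's
 * outgoing element (d0's old lane 3, now sitting in d0 lane 0) via
 * _mm_move_ss before d0 receives the new sample.  The 12- and 16-lag
 * versions below chain the same carry through d2 and d3.
 */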
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 12;
	__m128 sum0, sum1, sum2;

	(void) lag;
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d = _mm_shuffle_ps(d0, d0, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_new(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	int i;
	int limit = data_len - 16;
	__m128 sum0, sum1, sum2, sum3;

	(void) lag;
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);

	sum0 = _mm_setzero_ps();
	sum1 = _mm_setzero_ps();
	sum2 = _mm_setzero_ps();
	sum3 = _mm_setzero_ps();

	for(i = 0; i <= limit; i++) {
		__m128 d, d0, d1, d2, d3;
		d0 = _mm_loadu_ps(data+i);
		d1 = _mm_loadu_ps(data+i+4);
		d2 = _mm_loadu_ps(data+i+8);
		d3 = _mm_loadu_ps(data+i+12);
		d = _mm_shuffle_ps(d0, d0, 0);
		sum0 = _mm_add_ps(sum0, _mm_mul_ps(d0, d));
		sum1 = _mm_add_ps(sum1, _mm_mul_ps(d1, d));
		sum2 = _mm_add_ps(sum2, _mm_mul_ps(d2, d));
		sum3 = _mm_add_ps(sum3, _mm_mul_ps(d3, d));
	}

	{
		__m128 d0 = _mm_setzero_ps();
		__m128 d1 = _mm_setzero_ps();
		__m128 d2 = _mm_setzero_ps();
		__m128 d3 = _mm_setzero_ps();
		limit++; if(limit < 0) limit = 0;

		for(i = data_len-1; i >= limit; i--) {
			__m128 d;
			d = _mm_load_ss(data+i); d = _mm_shuffle_ps(d, d, 0);
			d3 = _mm_shuffle_ps(d3, d3, _MM_SHUFFLE(2,1,0,3));
			d2 = _mm_shuffle_ps(d2, d2, _MM_SHUFFLE(2,1,0,3));
			d1 = _mm_shuffle_ps(d1, d1, _MM_SHUFFLE(2,1,0,3));
			d0 = _mm_shuffle_ps(d0, d0, _MM_SHUFFLE(2,1,0,3));
			d3 = _mm_move_ss(d3, d2);
			d2 = _mm_move_ss(d2, d1);
			d1 = _mm_move_ss(d1, d0);
			d0 = _mm_move_ss(d0, d);
			sum3 = _mm_add_ps(sum3, _mm_mul_ps(d, d3));
			sum2 = _mm_add_ps(sum2, _mm_mul_ps(d, d2));
			sum1 = _mm_add_ps(sum1, _mm_mul_ps(d, d1));
			sum0 = _mm_add_ps(sum0, _mm_mul_ps(d, d0));
		}
	}

	_mm_storeu_ps(autoc,   sum0);
	_mm_storeu_ps(autoc+4, sum1);
	_mm_storeu_ps(autoc+8, sum2);
	_mm_storeu_ps(autoc+12,sum3);
}
/* old routines: faster on older Intel CPUs (up to Core 2) */
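
/* Structurally the *_old routines differ from the *_new ones above: instead
 * of four overlapping unaligned loads per step, they read one scalar per
 * sample (_mm_load_ss/_mm_load1_ps) and keep the sliding window entirely in
 * registers, shifting it with shuffle+move_ss every iteration.  That trades
 * loads for shuffles, which paid off where unaligned loads were slow, hence
 * "faster on older Intel".
 */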
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_4_old(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm2, xmm5;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 4);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_move_ss(xmm2, xmm0);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc, xmm5);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_old(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm5, xmm6;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 8);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm5 = _mm_add_ps(xmm5, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_12_old(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 12);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm5 = _mm_setzero_ps();
	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm5 = _mm_add_ps(xmm5, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm2);
		xmm5 = _mm_add_ps(xmm5, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm6 = _mm_add_ps(xmm6, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm4);
		xmm7 = _mm_add_ps(xmm7, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm5);
	_mm_storeu_ps(autoc+4, xmm6);
	_mm_storeu_ps(autoc+8, xmm7);
}
FLAC__SSE_TARGET("sse")
void FLAC__lpc_compute_autocorrelation_intrin_sse_lag_16_old(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
{
	__m128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9;

	(void) lag;
	FLAC__ASSERT(lag > 0);
	FLAC__ASSERT(lag <= 16);
	FLAC__ASSERT(lag <= data_len);
	FLAC__ASSERT(data_len > 0);

	xmm6 = _mm_setzero_ps();
	xmm7 = _mm_setzero_ps();
	xmm8 = _mm_setzero_ps();
	xmm9 = _mm_setzero_ps();

	xmm0 = _mm_load_ss(data++);
	xmm2 = xmm0;
	xmm0 = _mm_shuffle_ps(xmm0, xmm0, 0);
	xmm3 = _mm_setzero_ps();
	xmm4 = _mm_setzero_ps();
	xmm5 = _mm_setzero_ps();

	xmm0 = _mm_mul_ps(xmm0, xmm2);
	xmm6 = _mm_add_ps(xmm6, xmm0);

	data_len--;

	while(data_len)
	{
		xmm0 = _mm_load1_ps(data++);

		/* shift xmm5:xmm4:xmm3:xmm2 left by one float */
		xmm5 = _mm_shuffle_ps(xmm5, xmm5, _MM_SHUFFLE(2,1,0,3));
		xmm4 = _mm_shuffle_ps(xmm4, xmm4, _MM_SHUFFLE(2,1,0,3));
		xmm3 = _mm_shuffle_ps(xmm3, xmm3, _MM_SHUFFLE(2,1,0,3));
		xmm2 = _mm_shuffle_ps(xmm2, xmm2, _MM_SHUFFLE(2,1,0,3));
		xmm5 = _mm_move_ss(xmm5, xmm4);
		xmm4 = _mm_move_ss(xmm4, xmm3);
		xmm3 = _mm_move_ss(xmm3, xmm2);
		xmm2 = _mm_move_ss(xmm2, xmm0);

		/* xmm9|xmm8|xmm7|xmm6 += xmm0|xmm0|xmm0|xmm0 * xmm5|xmm4|xmm3|xmm2 */
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm5);
		xmm9 = _mm_add_ps(xmm9, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm4);
		xmm8 = _mm_add_ps(xmm8, xmm1);
		xmm1 = xmm0;
		xmm1 = _mm_mul_ps(xmm1, xmm3);
		xmm7 = _mm_add_ps(xmm7, xmm1);
		xmm0 = _mm_mul_ps(xmm0, xmm2);
		xmm6 = _mm_add_ps(xmm6, xmm0);

		data_len--;
	}

	_mm_storeu_ps(autoc,   xmm6);
	_mm_storeu_ps(autoc+4, xmm7);
	_mm_storeu_ps(autoc+8, xmm8);
	_mm_storeu_ps(autoc+12,xmm9);
}
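
/* Illustrative call (a hedged sketch, not code from libFLAC itself; the
 * encoder normally reaches these routines through function pointers chosen
 * at runtime from CPU feature detection):
 *
 *	FLAC__real block[4096];
 *	FLAC__real autoc[8];
 *	...fill block with samples...
 *	FLAC__lpc_compute_autocorrelation_intrin_sse_lag_8_new(block, 4096, 8, autoc);
 *
 * afterwards autoc[0] holds the block's energy and autoc[k] its lag-k
 * autocorrelation, the inputs to the Levinson-Durbin recursion.
 */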
#endif /* FLAC__SSE_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */