/* src/libFLAC/lpc_intrin_sse41.c */
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2013 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <smmintrin.h> /* SSE4.1 */

#ifdef FLAC__CPU_IA32
#define RESIDUAL64_RESULT(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt));
#define DATA64_RESULT(xmmN) data[i] = residual[i] + _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt));
#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization));
#define DATA64_RESULT1(xmmN) data[i] = residual[i] + _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization));
#else
#define RESIDUAL64_RESULT(xmmN) residual[i] = data[i] - (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#define DATA64_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#define RESIDUAL64_RESULT1(xmmN) RESIDUAL64_RESULT(xmmN)
#define DATA64_RESULT1(xmmN) DATA64_RESULT(xmmN)
#endif
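
/*
 * Added note: these macros finish one prediction. xmmN holds the 64-bit
 * dot product in its low lane; it is shifted right by lp_quantization
 * (logical shift on IA-32, arithmetic shift on x86-64; only the low 32 bits
 * of the result are used) and then subtracted from data[i] to form the
 * residual, or added to residual[i] to restore the sample. The "1" variants
 * pass lp_quantization straight to _mm_srli_epi64() instead of going through
 * the preloaded shift count `cnt`; on x86-64 they are plain aliases.
 */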

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
#ifdef FLAC__CPU_IA32
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
#endif

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
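
	/*
	 * Added note: each specialized branch below computes, per sample, the
	 * same value as this scalar reference (a sketch of the generic C code,
	 * shown only for orientation):
	 *
	 *     FLAC__int64 sum = 0;
	 *     for(j = 0; j < (int)order; j++)
	 *         sum += qlp_coeff[j] * (FLAC__int64)data[i-j-1];
	 *     residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
	 *
	 * The SSE4.1 paths form the products two at a time with _mm_mul_epi32()
	 * (signed 32x32 -> 64 on the low dwords of each 64-bit lane) and keep the
	 * running sum in a pair of 64-bit accumulator lanes.
	 */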

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]
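
					/*
					 * Added note: the shuffles above leave each coefficient
					 * pair in the low dwords of the two 64-bit lanes
					 * (0 q[2k+1] 0 q[2k]); inside the loop the data loads are
					 * shuffled into the mirrored order (0 d[i-2k-2] 0 d[i-2k-1]),
					 * so _mm_mul_epi32() yields q[j] * data[i-j-1] in each lane,
					 * exactly the products listed in the commented-out scalar code.
					 */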

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0 0 d[i-11] d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);
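
						/* Added note: xmm7 now holds two partial sums, one per
						 * 64-bit lane; fold the high lane onto the low lane so
						 * the low 64 bits contain the complete prediction. */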
						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					__m128i xmm0, xmm7;
					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_cvtsi32_si128(data[i-1]);
						xmm7 = _mm_mul_epi32(xmm7, xmm0);
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
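				/* Added note: no break statements on purpose; each case falls
				 * through so that exactly `order` taps are summed. */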
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	int i;
#ifdef FLAC__CPU_IA32
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
#endif

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */
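
	/*
	 * Added note: this is the mirror image of
	 * FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41():
	 * the same dot product is formed, but the sample is rebuilt as
	 *
	 *     data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
	 *
	 * so data[] is both the history being read and the output being written.
	 */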

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0 0 d[i-11] d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT1(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT1(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					__m128i xmm0, xmm7;
					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_cvtsi32_si128(data[i-1]);
						xmm7 = _mm_mul_epi32(xmm7, xmm0);
						DATA64_RESULT(xmm7);
					}
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
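				/* Added note: as in the residual function above, the cases
				 * fall through intentionally so that exactly `order` taps
				 * are summed. */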
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
		}
	}
}

#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */