/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2013  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
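/* SSE4.1 implementations of the "wide" LPC routines: the 32x32-bit products
 * are accumulated in 64 bits before the quantization shift, so they remain
 * exact in cases where a 32-bit accumulator could overflow. */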
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE4_1_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <smmintrin.h> /* SSE4.1 */

#ifdef FLAC__CPU_IA32
#define RESIDUAL64_RESULT(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt));
#define DATA64_RESULT(xmmN) data[i] = residual[i] + _mm_cvtsi128_si32(_mm_srl_epi64(xmmN, cnt));
#define RESIDUAL64_RESULT1(xmmN) residual[i] = data[i] - _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization));
#define DATA64_RESULT1(xmmN) data[i] = residual[i] + _mm_cvtsi128_si32(_mm_srli_epi64(xmmN, lp_quantization));
#else
#define RESIDUAL64_RESULT(xmmN) residual[i] = data[i] - (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#define DATA64_RESULT(xmmN) data[i] = residual[i] + (FLAC__int32)(_mm_cvtsi128_si64(xmmN) >> lp_quantization);
#define RESIDUAL64_RESULT1(xmmN) RESIDUAL64_RESULT(xmmN)
#define DATA64_RESULT1(xmmN) DATA64_RESULT(xmmN)
#endif
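/* The result macros come in two flavours: the IA-32 versions extract the low
 * 32 bits of the shifted 64-bit sum with _mm_cvtsi128_si32() after a logical
 * shift (there is no _mm_cvtsi128_si64() or 64-bit arithmetic shift on
 * IA-32), while the x86-64 versions move the full 64-bit sum to an integer
 * register and shift it with >>.  The *_RESULT1 forms use an immediate shift
 * count instead of the cnt register. */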
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_compute_residual_from_qlp_coefficients_wide_intrin_sse41(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
#ifdef FLAC__CPU_IA32
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
#endif

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
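					/* The filter is evaluated two taps at a time: each xmmN register holds two
					 * zero-extended coefficients laid out as 0 q[k+1] 0 q[k]; the matching pair of
					 * history samples is loaded with _mm_loadl_epi64() and shuffled so each
					 * coefficient lines up with its sample in the even dword lanes, then
					 * _mm_mul_epi32() forms the two signed 32x32->64-bit products, accumulated
					 * with _mm_add_epi64().  The two 64-bit halves are summed via
					 * _mm_srli_si128(xmm7, 8) before the result macro applies the shift. */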
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT1(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL64_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					__m128i xmm0, xmm7;
					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_cvtsi32_si128(data[i-1]);
						xmm7 = _mm_mul_epi32(xmm7, xmm0);
						RESIDUAL64_RESULT(xmm7);
					}
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			residual[i] = data[i] - (FLAC__int32)(sum >> lp_quantization);
		}
	}
}
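/* Inverse of the routine above: given the quantized-LPC residual, rebuild the
 * signal as data[i] = residual[i] + (sum >> lp_quantization), where sum is the
 * same 64-bit prediction, computed here from the previously restored samples. */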
FLAC__SSE_TARGET("sse4.1")
void FLAC__lpc_restore_signal_wide_intrin_sse41(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	int i;
#ifdef FLAC__CPU_IA32
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);
#endif

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);
	FLAC__ASSERT(lp_quantization <= 32); /* there's no _mm_sra_epi64() so we have to use _mm_srl_epi64() */

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
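					/* Same register layout and per-order specializations as the residual
					 * routine above; only the final DATA64_RESULT*() macro differs, adding the
					 * shifted prediction back onto the residual instead of subtracting it. */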
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
						//sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT1(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[10] * (FLAC__int64)data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epi32(xmm7, xmm5);

						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm4);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT1(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[9] * (FLAC__int64)data[i-10];
						//sum += qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[8] * (FLAC__int64)data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epi32(xmm7, xmm4);

						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm3);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[7] * (FLAC__int64)data[i-8];
						//sum += qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[6] * (FLAC__int64)data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epi32(xmm7, xmm3);

						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm2);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[5] * (FLAC__int64)data[i-6];
						//sum += qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[4] * (FLAC__int64)data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epi32(xmm7, xmm2);

						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm1);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[3] * (FLAC__int64)data[i-4];
						//sum += qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[2] * (FLAC__int64)data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epi32(xmm7, xmm1);

						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epi32(xmm6, xmm0);
						xmm7 = _mm_add_epi64(xmm7, xmm6);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum += qlp_coeff[1] * (FLAC__int64)data[i-2];
						//sum += qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epi32(xmm7, xmm0);

						xmm7 = _mm_add_epi64(xmm7, _mm_srli_si128(xmm7, 8));
						DATA64_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					__m128i xmm0, xmm7;
					xmm0 = _mm_cvtsi32_si128(qlp_coeff[0]);

					for(i = 0; i < (int)data_len; i++) {
						//sum = qlp_coeff[0] * (FLAC__int64)data[i-1];
						xmm7 = _mm_cvtsi32_si128(data[i-1]);
						xmm7 = _mm_mul_epi32(xmm7, xmm0);
						DATA64_RESULT(xmm7);
					}
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int64 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * (FLAC__int64)data[i-32];
				case 31: sum += qlp_coeff[30] * (FLAC__int64)data[i-31];
				case 30: sum += qlp_coeff[29] * (FLAC__int64)data[i-30];
				case 29: sum += qlp_coeff[28] * (FLAC__int64)data[i-29];
				case 28: sum += qlp_coeff[27] * (FLAC__int64)data[i-28];
				case 27: sum += qlp_coeff[26] * (FLAC__int64)data[i-27];
				case 26: sum += qlp_coeff[25] * (FLAC__int64)data[i-26];
				case 25: sum += qlp_coeff[24] * (FLAC__int64)data[i-25];
				case 24: sum += qlp_coeff[23] * (FLAC__int64)data[i-24];
				case 23: sum += qlp_coeff[22] * (FLAC__int64)data[i-23];
				case 22: sum += qlp_coeff[21] * (FLAC__int64)data[i-22];
				case 21: sum += qlp_coeff[20] * (FLAC__int64)data[i-21];
				case 20: sum += qlp_coeff[19] * (FLAC__int64)data[i-20];
				case 19: sum += qlp_coeff[18] * (FLAC__int64)data[i-19];
				case 18: sum += qlp_coeff[17] * (FLAC__int64)data[i-18];
				case 17: sum += qlp_coeff[16] * (FLAC__int64)data[i-17];
				case 16: sum += qlp_coeff[15] * (FLAC__int64)data[i-16];
				case 15: sum += qlp_coeff[14] * (FLAC__int64)data[i-15];
				case 14: sum += qlp_coeff[13] * (FLAC__int64)data[i-14];
				case 13: sum += qlp_coeff[12] * (FLAC__int64)data[i-13];
				         sum += qlp_coeff[11] * (FLAC__int64)data[i-12];
				         sum += qlp_coeff[10] * (FLAC__int64)data[i-11];
				         sum += qlp_coeff[ 9] * (FLAC__int64)data[i-10];
				         sum += qlp_coeff[ 8] * (FLAC__int64)data[i- 9];
				         sum += qlp_coeff[ 7] * (FLAC__int64)data[i- 8];
				         sum += qlp_coeff[ 6] * (FLAC__int64)data[i- 7];
				         sum += qlp_coeff[ 5] * (FLAC__int64)data[i- 6];
				         sum += qlp_coeff[ 4] * (FLAC__int64)data[i- 5];
				         sum += qlp_coeff[ 3] * (FLAC__int64)data[i- 4];
				         sum += qlp_coeff[ 2] * (FLAC__int64)data[i- 3];
				         sum += qlp_coeff[ 1] * (FLAC__int64)data[i- 2];
				         sum += qlp_coeff[ 0] * (FLAC__int64)data[i- 1];
			}
			data[i] = residual[i] + (FLAC__int32)(sum >> lp_quantization);
		}
	}
}
#endif /* FLAC__SSE4_1_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */