1 /* libFLAC - Free Lossless Audio Codec library
2 * Copyright (C) 2000-2009 Josh Coalson
3 * Copyright (C) 2011-2013 Xiph.Org Foundation
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
12 * - Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
16 * - Neither the name of the Xiph.org Foundation nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 #ifdef HAVE_CONFIG_H
34 # include <config.h>
35 #endif
37 #ifndef FLAC__INTEGER_ONLY_LIBRARY
38 #ifndef FLAC__NO_ASM
39 #if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
40 #include "private/lpc.h"
41 #ifdef FLAC__SSE2_SUPPORTED
43 #include "FLAC/assert.h"
44 #include "FLAC/format.h"
46 #include <emmintrin.h> /* SSE2 */
48 #define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
49 #define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
51 #define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
52 #define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
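/*
 * Reference note (a sketch, not part of the original macros): each *_RESULT
 * macro consumes the low 32-bit lane of xmmN, which at that point holds the
 * predictor sum.  In scalar form the two operations are roughly
 *
 *     sum = _mm_cvtsi128_si32(xmmN);                   // low dword = predictor sum
 *     residual = data - (sum >> lp_quantization);      // RESIDUAL16/32_RESULT
 *     data     = residual + (sum >> lp_quantization);  // DATA16/32_RESULT
 *
 * The 16-bit variants additionally advance the data/residual pointers and keep
 * the current sample in `curr` so the loops below can re-insert it into the
 * SIMD shift registers.
 */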
54 FLAC__SSE_TARGET("sse2")
55 void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
57 int i;
58 FLAC__int32 sum;
60 FLAC__ASSERT(order > 0);
61 FLAC__ASSERT(order <= 32);
62 FLAC__ASSERT(data_len > 0);
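/* Orders 1..12 take the hand-unrolled SSE2 paths below, where the 32-bit
   coefficients and samples are packed to 16 bits and combined with
   _mm_madd_epi16; orders 13..32 fall through to the scalar switch at the
   end of this function. */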
64 if(order <= 12) {
65 FLAC__int32 curr;
66 if(order > 8) { /* order == 9, 10, 11, 12 */
67 #ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
68 int r;
69 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
70 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
71 xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
72 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
73 switch(order) /* ...and zero them out */
75 case 9:
76 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
77 case 10:
78 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
79 case 11:
80 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
82 xmm2 = _mm_setzero_si128();
83 xmm0 = _mm_packs_epi32(xmm0, xmm6);
84 xmm1 = _mm_packs_epi32(xmm1, xmm2);
86 xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
87 xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
88 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
89 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
90 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
91 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
92 xmm4 = _mm_packs_epi32(xmm4, xmm2);
93 xmm3 = _mm_packs_epi32(xmm3, xmm5);
95 xmm7 = _mm_slli_si128(xmm1, 2);
96 xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
97 xmm2 = _mm_slli_si128(xmm0, 2);
99 /* xmm0, xmm1: qlp_coeff
100 xmm2, xmm7: qlp_coeff << 16 bit
101 xmm3, xmm4: data */
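/* One output sample: each _mm_madd_epi16 multiplies eight 16-bit
   coefficient/sample pairs and sums adjacent products into four 32-bit
   lanes; the two partial sums are added together and the shift-and-add
   steps below fold the four lanes into the low dword, which
   RESIDUAL16_RESULT then consumes. */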
103 xmm6 = xmm4;
104 xmm6 = _mm_madd_epi16(xmm6, xmm1);
105 xmm5 = xmm3;
106 xmm5 = _mm_madd_epi16(xmm5, xmm0);
107 xmm6 = _mm_add_epi32(xmm6, xmm5);
108 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
109 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
111 RESIDUAL16_RESULT(xmm6);
113 data_len--;
114 r = data_len % 2;
116 if(r) {
117 xmm4 = _mm_slli_si128(xmm4, 2);
118 xmm6 = xmm3;
119 xmm3 = _mm_slli_si128(xmm3, 2);
120 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
121 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
123 xmm6 = xmm4;
124 xmm6 = _mm_madd_epi16(xmm6, xmm1);
125 xmm5 = xmm3;
126 xmm5 = _mm_madd_epi16(xmm5, xmm0);
127 xmm6 = _mm_add_epi32(xmm6, xmm5);
128 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
129 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
131 RESIDUAL16_RESULT(xmm6);
133 data_len--;
136 while(data_len) { /* data_len is a multiple of 2 */
137 /* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
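/* Each pass emits two residuals: the data window is shifted by two samples
   (4 bytes) at once, the first result is taken against the coefficient set
   pre-shifted by one 16-bit slot (xmm2:xmm7), and the second against the
   unshifted set (xmm0:xmm1) once `curr` has been inserted into slot 0. */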
138 xmm4 = _mm_slli_si128(xmm4, 4);
139 xmm6 = xmm3;
140 xmm3 = _mm_slli_si128(xmm3, 4);
141 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
142 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
144 xmm6 = xmm4;
145 xmm6 = _mm_madd_epi16(xmm6, xmm7);
146 xmm5 = xmm3;
147 xmm5 = _mm_madd_epi16(xmm5, xmm2);
148 xmm6 = _mm_add_epi32(xmm6, xmm5);
149 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
150 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
152 RESIDUAL16_RESULT(xmm6);
154 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
156 xmm6 = xmm4;
157 xmm6 = _mm_madd_epi16(xmm6, xmm1);
158 xmm5 = xmm3;
159 xmm5 = _mm_madd_epi16(xmm5, xmm0);
160 xmm6 = _mm_add_epi32(xmm6, xmm5);
161 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
162 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
164 RESIDUAL16_RESULT(xmm6);
166 data_len-=2;
168 #else /* 16 XMM registers available */
169 int r;
170 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
171 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
172 xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
173 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
174 switch(order) /* ...and zero them out */
176 case 9:
177 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
178 case 10:
179 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
180 case 11:
181 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
183 xmm2 = _mm_setzero_si128();
184 xmm0 = _mm_packs_epi32(xmm0, xmm6);
185 xmm1 = _mm_packs_epi32(xmm1, xmm2);
187 xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
188 xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
189 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
190 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
191 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
192 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
193 xmm4 = _mm_packs_epi32(xmm4, xmm2);
194 xmm3 = _mm_packs_epi32(xmm3, xmm5);
196 xmm7 = _mm_slli_si128(xmm1, 2);
197 xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
198 xmm2 = _mm_slli_si128(xmm0, 2);
200 xmm9 = _mm_slli_si128(xmm1, 4);
201 xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
202 xmm8 = _mm_slli_si128(xmm0, 4);
204 xmmB = _mm_slli_si128(xmm1, 6);
205 xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
206 xmmA = _mm_slli_si128(xmm0, 6);
208 /* xmm0, xmm1: qlp_coeff
209 xmm2, xmm7: qlp_coeff << 16 bit
210 xmm8, xmm9: qlp_coeff << 2*16 bit
211 xmmA, xmmB: qlp_coeff << 3*16 bit
212 xmm3, xmm4: data */
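/* With 16 XMM registers the coefficient pair is also kept pre-shifted by
   one, two and three 16-bit slots (xmm2:xmm7, xmm8:xmm9, xmmA:xmmB), so the
   main loop below can shift the data window by four samples (8 bytes) at a
   time and emit four residuals per iteration. */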
214 xmm6 = xmm4;
215 xmm6 = _mm_madd_epi16(xmm6, xmm1);
216 xmm5 = xmm3;
217 xmm5 = _mm_madd_epi16(xmm5, xmm0);
218 xmm6 = _mm_add_epi32(xmm6, xmm5);
219 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
220 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
222 RESIDUAL16_RESULT(xmm6);
224 data_len--;
225 r = data_len % 4;
227 while(r) {
228 xmm4 = _mm_slli_si128(xmm4, 2);
229 xmm6 = xmm3;
230 xmm3 = _mm_slli_si128(xmm3, 2);
231 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
232 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
234 xmm6 = xmm4;
235 xmm6 = _mm_madd_epi16(xmm6, xmm1);
236 xmm5 = xmm3;
237 xmm5 = _mm_madd_epi16(xmm5, xmm0);
238 xmm6 = _mm_add_epi32(xmm6, xmm5);
239 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
240 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
242 RESIDUAL16_RESULT(xmm6);
244 data_len--; r--;
247 while(data_len) { /* data_len is a multiple of 4 */
248 xmm4 = _mm_slli_si128(xmm4, 8);
249 xmm6 = xmm3;
250 xmm3 = _mm_slli_si128(xmm3, 8);
251 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));
253 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
255 xmm6 = xmm4;
256 xmm6 = _mm_madd_epi16(xmm6, xmmB);
257 xmm5 = xmm3;
258 xmm5 = _mm_madd_epi16(xmm5, xmmA);
259 xmm6 = _mm_add_epi32(xmm6, xmm5);
260 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
261 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
263 RESIDUAL16_RESULT(xmm6);
265 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
267 xmm6 = xmm4;
268 xmm6 = _mm_madd_epi16(xmm6, xmm9);
269 xmm5 = xmm3;
270 xmm5 = _mm_madd_epi16(xmm5, xmm8);
271 xmm6 = _mm_add_epi32(xmm6, xmm5);
272 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
273 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
275 RESIDUAL16_RESULT(xmm6);
277 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
279 xmm6 = xmm4;
280 xmm6 = _mm_madd_epi16(xmm6, xmm7);
281 xmm5 = xmm3;
282 xmm5 = _mm_madd_epi16(xmm5, xmm2);
283 xmm6 = _mm_add_epi32(xmm6, xmm5);
284 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
285 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
287 RESIDUAL16_RESULT(xmm6);
289 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
291 xmm6 = xmm4;
292 xmm6 = _mm_madd_epi16(xmm6, xmm1);
293 xmm5 = xmm3;
294 xmm5 = _mm_madd_epi16(xmm5, xmm0);
295 xmm6 = _mm_add_epi32(xmm6, xmm5);
296 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
297 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
299 RESIDUAL16_RESULT(xmm6);
301 data_len-=4;
303 #endif
304 } /* endif(order > 8) */
305 else if(order > 4) { /* order == 5, 6, 7, 8 */
306 if(order > 6) { /* order == 7, 8 */
307 if(order == 8) {
308 __m128i xmm0, xmm1, xmm3, xmm6;
309 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
310 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
311 xmm0 = _mm_packs_epi32(xmm0, xmm1);
313 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
314 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
315 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
316 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
317 xmm3 = _mm_packs_epi32(xmm3, xmm1);
319 /* xmm0: qlp_coeff
320 xmm3: data */
322 xmm6 = xmm3;
323 xmm6 = _mm_madd_epi16(xmm6, xmm0);
324 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
325 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
327 RESIDUAL16_RESULT(xmm6);
329 data_len--;
331 while(data_len) {
332 xmm3 = _mm_slli_si128(xmm3, 2);
333 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
335 xmm6 = xmm3;
336 xmm6 = _mm_madd_epi16(xmm6, xmm0);
337 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
338 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
340 RESIDUAL16_RESULT(xmm6);
342 data_len--;
345 else { /* order == 7 */
346 int r;
347 __m128i xmm0, xmm1, xmm2, xmm3, xmm6;
348 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
349 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
350 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
351 xmm0 = _mm_packs_epi32(xmm0, xmm1);
353 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
354 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
355 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
356 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
357 xmm3 = _mm_packs_epi32(xmm3, xmm1);
358 xmm2 = _mm_slli_si128(xmm0, 2);
360 /* xmm0: qlp_coeff
361 xmm2: qlp_coeff << 16 bit
362 xmm3: data */
364 xmm6 = xmm3;
365 xmm6 = _mm_madd_epi16(xmm6, xmm0);
366 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
367 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
369 RESIDUAL16_RESULT(xmm6);
371 data_len--;
372 r = data_len % 2;
374 if(r) {
375 xmm3 = _mm_slli_si128(xmm3, 2);
376 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
378 xmm6 = xmm3;
379 xmm6 = _mm_madd_epi16(xmm6, xmm0);
380 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
381 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
383 RESIDUAL16_RESULT(xmm6);
385 data_len--;
388 while(data_len) { /* data_len is a multiple of 2 */
389 xmm3 = _mm_slli_si128(xmm3, 4);
390 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
392 xmm6 = xmm3;
393 xmm6 = _mm_madd_epi16(xmm6, xmm2);
394 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
395 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
397 RESIDUAL16_RESULT(xmm6);
399 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
400 xmm6 = xmm3;
401 xmm6 = _mm_madd_epi16(xmm6, xmm0);
402 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
403 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
405 RESIDUAL16_RESULT(xmm6);
407 data_len-=2;
411 else { /* order == 5, 6 */
412 if(order == 6) {
413 int r;
414 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
415 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
416 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
417 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
418 xmm0 = _mm_packs_epi32(xmm0, xmm1);
420 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
421 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
422 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
423 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
424 xmm3 = _mm_packs_epi32(xmm3, xmm1);
425 xmm2 = _mm_slli_si128(xmm0, 2);
426 xmm4 = _mm_slli_si128(xmm0, 4);
428 /* xmm0: qlp_coeff
429 xmm2: qlp_coeff << 16 bit
430 xmm4: qlp_coeff << 2*16 bit
431 xmm3: data */
433 xmm6 = xmm3;
434 xmm6 = _mm_madd_epi16(xmm6, xmm0);
435 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
436 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
438 RESIDUAL16_RESULT(xmm6);
440 data_len--;
441 r = data_len % 3;
443 while(r) {
444 xmm3 = _mm_slli_si128(xmm3, 2);
445 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
447 xmm6 = xmm3;
448 xmm6 = _mm_madd_epi16(xmm6, xmm0);
449 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
450 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
452 RESIDUAL16_RESULT(xmm6);
454 data_len--; r--;
457 while(data_len) { /* data_len is a multiple of 3 */
458 xmm3 = _mm_slli_si128(xmm3, 6);
459 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
461 xmm6 = xmm3;
462 xmm6 = _mm_madd_epi16(xmm6, xmm4);
463 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
464 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
466 RESIDUAL16_RESULT(xmm6);
468 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
470 xmm6 = xmm3;
471 xmm6 = _mm_madd_epi16(xmm6, xmm2);
472 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
473 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
475 RESIDUAL16_RESULT(xmm6);
477 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
479 xmm6 = xmm3;
480 xmm6 = _mm_madd_epi16(xmm6, xmm0);
481 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
482 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
484 RESIDUAL16_RESULT(xmm6);
486 data_len-=3;
489 else { /* order == 5 */
490 int r;
491 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
492 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
493 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
494 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
495 xmm0 = _mm_packs_epi32(xmm0, xmm1);
497 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
498 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
499 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
500 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
501 xmm3 = _mm_packs_epi32(xmm3, xmm1);
502 xmm2 = _mm_slli_si128(xmm0, 2);
503 xmm4 = _mm_slli_si128(xmm0, 4);
504 xmm5 = _mm_slli_si128(xmm0, 6);
506 /* xmm0: qlp_coeff
507 xmm2: qlp_coeff << 16 bit
508 xmm4: qlp_coeff << 2*16 bit
509 xmm5: qlp_coeff << 3*16 bit
510 xmm3: data */
512 xmm6 = xmm3;
513 xmm6 = _mm_madd_epi16(xmm6, xmm0);
514 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
515 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
517 RESIDUAL16_RESULT(xmm6);
519 data_len--;
520 r = data_len % 4;
522 while(r) {
523 xmm3 = _mm_slli_si128(xmm3, 2);
524 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
526 xmm6 = xmm3;
527 xmm6 = _mm_madd_epi16(xmm6, xmm0);
528 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
529 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
531 RESIDUAL16_RESULT(xmm6);
533 data_len--; r--;
536 while(data_len) { /* data_len is a multiple of 4 */
537 xmm3 = _mm_slli_si128(xmm3, 8);
538 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
540 xmm6 = xmm3;
541 xmm6 = _mm_madd_epi16(xmm6, xmm5);
542 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
543 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
545 RESIDUAL16_RESULT(xmm6);
547 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
549 xmm6 = xmm3;
550 xmm6 = _mm_madd_epi16(xmm6, xmm4);
551 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
552 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
554 RESIDUAL16_RESULT(xmm6);
556 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
558 xmm6 = xmm3;
559 xmm6 = _mm_madd_epi16(xmm6, xmm2);
560 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
561 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
563 RESIDUAL16_RESULT(xmm6);
565 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
567 xmm6 = xmm3;
568 xmm6 = _mm_madd_epi16(xmm6, xmm0);
569 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
570 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
572 RESIDUAL16_RESULT(xmm6);
574 data_len-=4;
579 else { /* order == 1, 2, 3, 4 */
580 if(order > 2) {
581 if(order == 4) {
582 __m128i xmm0, xmm3, xmm6;
583 xmm6 = _mm_setzero_si128();
584 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
585 xmm0 = _mm_packs_epi32(xmm0, xmm6);
587 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
588 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
589 xmm3 = _mm_packs_epi32(xmm3, xmm6);
591 /* xmm0: qlp_coeff
592 xmm3: data */
594 xmm6 = xmm3;
595 xmm6 = _mm_madd_epi16(xmm6, xmm0);
596 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
598 RESIDUAL16_RESULT(xmm6);
600 data_len--;
602 while(data_len) {
603 xmm3 = _mm_slli_si128(xmm3, 2);
604 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
606 xmm6 = xmm3;
607 xmm6 = _mm_madd_epi16(xmm6, xmm0);
608 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
610 RESIDUAL16_RESULT(xmm6);
612 data_len--;
615 else { /* order == 3 */
616 int r;
617 __m128i xmm0, xmm1, xmm3, xmm6;
618 xmm6 = _mm_setzero_si128();
619 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
620 xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
621 xmm0 = _mm_packs_epi32(xmm0, xmm6);
623 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
624 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
625 xmm3 = _mm_packs_epi32(xmm3, xmm6);
626 xmm1 = _mm_slli_si128(xmm0, 2);
628 /* xmm0: qlp_coeff
629 xmm1: qlp_coeff << 16 bit
630 xmm3: data */
632 xmm6 = xmm3;
633 xmm6 = _mm_madd_epi16(xmm6, xmm0);
634 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
636 RESIDUAL16_RESULT(xmm6);
638 data_len--;
639 r = data_len % 2;
641 if(r) {
642 xmm3 = _mm_slli_si128(xmm3, 2);
643 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
645 xmm6 = xmm3;
646 xmm6 = _mm_madd_epi16(xmm6, xmm0);
647 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
649 RESIDUAL16_RESULT(xmm6);
651 data_len--;
654 while(data_len) { /* data_len is a multiple of 2 */
655 xmm3 = _mm_slli_si128(xmm3, 4);
657 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
659 xmm6 = xmm3;
660 xmm6 = _mm_madd_epi16(xmm6, xmm1);
661 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
663 RESIDUAL16_RESULT(xmm6);
665 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
667 xmm6 = xmm3;
668 xmm6 = _mm_madd_epi16(xmm6, xmm0);
669 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
671 RESIDUAL16_RESULT(xmm6);
673 data_len-=2;
677 else {
678 if(order == 2) {
679 __m128i xmm0, xmm3, xmm6;
680 xmm6 = _mm_setzero_si128();
681 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
682 xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
683 xmm0 = _mm_packs_epi32(xmm0, xmm6);
685 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
686 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
687 xmm3 = _mm_packs_epi32(xmm3, xmm6);
689 /* xmm0: qlp_coeff
690 xmm3: data */
692 xmm6 = xmm3;
693 xmm6 = _mm_madd_epi16(xmm6, xmm0);
695 RESIDUAL16_RESULT(xmm6);
697 data_len--;
699 while(data_len) {
700 xmm3 = _mm_slli_si128(xmm3, 2);
701 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
703 xmm6 = xmm3;
704 xmm6 = _mm_madd_epi16(xmm6, xmm0);
706 RESIDUAL16_RESULT(xmm6);
708 data_len--;
711 else { /* order == 1 */
712 for(i = 0; i < (int)data_len; i++)
713 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
718 else { /* order > 12 */
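/* Generic fallback: the switch falls through from `case order` down to
   case 13, adding one product per coefficient above the twelfth, and the
   unconditional statements after case 13 accumulate the remaining twelve
   terms before the residual is formed. */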
719 for(i = 0; i < (int)data_len; i++) {
720 sum = 0;
721 switch(order) {
722 case 32: sum += qlp_coeff[31] * data[i-32];
723 case 31: sum += qlp_coeff[30] * data[i-31];
724 case 30: sum += qlp_coeff[29] * data[i-30];
725 case 29: sum += qlp_coeff[28] * data[i-29];
726 case 28: sum += qlp_coeff[27] * data[i-28];
727 case 27: sum += qlp_coeff[26] * data[i-27];
728 case 26: sum += qlp_coeff[25] * data[i-26];
729 case 25: sum += qlp_coeff[24] * data[i-25];
730 case 24: sum += qlp_coeff[23] * data[i-24];
731 case 23: sum += qlp_coeff[22] * data[i-23];
732 case 22: sum += qlp_coeff[21] * data[i-22];
733 case 21: sum += qlp_coeff[20] * data[i-21];
734 case 20: sum += qlp_coeff[19] * data[i-20];
735 case 19: sum += qlp_coeff[18] * data[i-19];
736 case 18: sum += qlp_coeff[17] * data[i-18];
737 case 17: sum += qlp_coeff[16] * data[i-17];
738 case 16: sum += qlp_coeff[15] * data[i-16];
739 case 15: sum += qlp_coeff[14] * data[i-15];
740 case 14: sum += qlp_coeff[13] * data[i-14];
741 case 13: sum += qlp_coeff[12] * data[i-13];
742 sum += qlp_coeff[11] * data[i-12];
743 sum += qlp_coeff[10] * data[i-11];
744 sum += qlp_coeff[ 9] * data[i-10];
745 sum += qlp_coeff[ 8] * data[i- 9];
746 sum += qlp_coeff[ 7] * data[i- 8];
747 sum += qlp_coeff[ 6] * data[i- 7];
748 sum += qlp_coeff[ 5] * data[i- 6];
749 sum += qlp_coeff[ 4] * data[i- 5];
750 sum += qlp_coeff[ 3] * data[i- 4];
751 sum += qlp_coeff[ 2] * data[i- 3];
752 sum += qlp_coeff[ 1] * data[i- 2];
753 sum += qlp_coeff[ 0] * data[i- 1];
755 residual[i] = data[i] - (sum >> lp_quantization);
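/* The function below handles full 32-bit coefficients and samples.  Instead
   of packing to 16 bits it keeps two coefficients per register in the even
   dword lanes and multiplies them with _mm_mul_epu32, keeping only the low
   32 bits of each 64-bit product, so the sum is accumulated with the same
   truncating 32-bit arithmetic as the scalar version. */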
760 FLAC__SSE_TARGET("sse2")
761 void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
763 int i;
765 FLAC__ASSERT(order > 0);
766 FLAC__ASSERT(order <= 32);
768 if(order <= 12) {
769 if(order > 8) { /* order == 9, 10, 11, 12 */
770 if(order > 10) { /* order == 11, 12 */
771 if(order == 12) {
772 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
773 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0]
774 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2]
775 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4]
776 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6]
777 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8]
778 xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]
780 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
781 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
782 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
783 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
784 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
785 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]
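/* The coefficient shuffles above use _MM_SHUFFLE(3,1,2,0) to spread each
   pair into the even dword lanes; the data loads below use
   _MM_SHUFFLE(2,0,3,1), which both spreads and swaps the two samples so
   that q[k] lines up with data[i-1-k] for _mm_mul_epu32. */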
787 for(i = 0; i < (int)data_len; i++) {
788 //sum = 0;
789 //sum += qlp_coeff[11] * data[i-12];
790 //sum += qlp_coeff[10] * data[i-11];
791 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
792 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
793 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
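/* This works because only the low 32 bits of each product are kept, and
   those are identical for signed and unsigned operands; SSE2 has no signed
   32x32->32 multiply (_mm_mullo_epi32 only arrived with SSE4.1). */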
795 //sum += qlp_coeff[9] * data[i-10];
796 //sum += qlp_coeff[8] * data[i-9];
797 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
798 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
799 xmm6 = _mm_mul_epu32(xmm6, xmm4);
800 xmm7 = _mm_add_epi32(xmm7, xmm6);
802 //sum += qlp_coeff[7] * data[i-8];
803 //sum += qlp_coeff[6] * data[i-7];
804 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
805 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
806 xmm6 = _mm_mul_epu32(xmm6, xmm3);
807 xmm7 = _mm_add_epi32(xmm7, xmm6);
809 //sum += qlp_coeff[5] * data[i-6];
810 //sum += qlp_coeff[4] * data[i-5];
811 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
812 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
813 xmm6 = _mm_mul_epu32(xmm6, xmm2);
814 xmm7 = _mm_add_epi32(xmm7, xmm6);
816 //sum += qlp_coeff[3] * data[i-4];
817 //sum += qlp_coeff[2] * data[i-3];
818 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
819 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
820 xmm6 = _mm_mul_epu32(xmm6, xmm1);
821 xmm7 = _mm_add_epi32(xmm7, xmm6);
823 //sum += qlp_coeff[1] * data[i-2];
824 //sum += qlp_coeff[0] * data[i-1];
825 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
826 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
827 xmm6 = _mm_mul_epu32(xmm6, xmm0);
828 xmm7 = _mm_add_epi32(xmm7, xmm6);
830 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
831 RESIDUAL32_RESULT(xmm7);
834 else { /* order == 11 */
835 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
836 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
837 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
838 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
839 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
840 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
841 xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
843 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
844 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
845 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
846 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
847 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
849 for(i = 0; i < (int)data_len; i++) {
850 //sum = 0;
851 //sum = qlp_coeff[10] * data[i-11];
852 xmm7 = _mm_cvtsi32_si128(data[i-11]);
853 xmm7 = _mm_mul_epu32(xmm7, xmm5);
855 //sum += qlp_coeff[9] * data[i-10];
856 //sum += qlp_coeff[8] * data[i-9];
857 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
858 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
859 xmm6 = _mm_mul_epu32(xmm6, xmm4);
860 xmm7 = _mm_add_epi32(xmm7, xmm6);
862 //sum += qlp_coeff[7] * data[i-8];
863 //sum += qlp_coeff[6] * data[i-7];
864 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
865 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
866 xmm6 = _mm_mul_epu32(xmm6, xmm3);
867 xmm7 = _mm_add_epi32(xmm7, xmm6);
869 //sum += qlp_coeff[5] * data[i-6];
870 //sum += qlp_coeff[4] * data[i-5];
871 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
872 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
873 xmm6 = _mm_mul_epu32(xmm6, xmm2);
874 xmm7 = _mm_add_epi32(xmm7, xmm6);
876 //sum += qlp_coeff[3] * data[i-4];
877 //sum += qlp_coeff[2] * data[i-3];
878 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
879 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
880 xmm6 = _mm_mul_epu32(xmm6, xmm1);
881 xmm7 = _mm_add_epi32(xmm7, xmm6);
883 //sum += qlp_coeff[1] * data[i-2];
884 //sum += qlp_coeff[0] * data[i-1];
885 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
886 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
887 xmm6 = _mm_mul_epu32(xmm6, xmm0);
888 xmm7 = _mm_add_epi32(xmm7, xmm6);
890 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
891 RESIDUAL32_RESULT(xmm7);
895 else { /* order == 9, 10 */
896 if(order == 10) {
897 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
898 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
899 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
900 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
901 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
902 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
904 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
905 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
906 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
907 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
908 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
910 for(i = 0; i < (int)data_len; i++) {
911 //sum = 0;
912 //sum += qlp_coeff[9] * data[i-10];
913 //sum += qlp_coeff[8] * data[i-9];
914 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
915 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
916 xmm7 = _mm_mul_epu32(xmm7, xmm4);
918 //sum += qlp_coeff[7] * data[i-8];
919 //sum += qlp_coeff[6] * data[i-7];
920 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
921 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
922 xmm6 = _mm_mul_epu32(xmm6, xmm3);
923 xmm7 = _mm_add_epi32(xmm7, xmm6);
925 //sum += qlp_coeff[5] * data[i-6];
926 //sum += qlp_coeff[4] * data[i-5];
927 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
928 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
929 xmm6 = _mm_mul_epu32(xmm6, xmm2);
930 xmm7 = _mm_add_epi32(xmm7, xmm6);
932 //sum += qlp_coeff[3] * data[i-4];
933 //sum += qlp_coeff[2] * data[i-3];
934 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
935 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
936 xmm6 = _mm_mul_epu32(xmm6, xmm1);
937 xmm7 = _mm_add_epi32(xmm7, xmm6);
939 //sum += qlp_coeff[1] * data[i-2];
940 //sum += qlp_coeff[0] * data[i-1];
941 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
942 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
943 xmm6 = _mm_mul_epu32(xmm6, xmm0);
944 xmm7 = _mm_add_epi32(xmm7, xmm6);
946 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
947 RESIDUAL32_RESULT(xmm7);
950 else { /* order == 9 */
951 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
952 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
953 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
954 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
955 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
956 xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
958 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
959 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
960 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
961 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
963 for(i = 0; i < (int)data_len; i++) {
964 //sum = 0;
965 //sum = qlp_coeff[8] * data[i-9];
966 xmm7 = _mm_cvtsi32_si128(data[i-9]);
967 xmm7 = _mm_mul_epu32(xmm7, xmm4);
969 //sum += qlp_coeff[7] * data[i-8];
970 //sum += qlp_coeff[6] * data[i-7];
971 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
972 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
973 xmm6 = _mm_mul_epu32(xmm6, xmm3);
974 xmm7 = _mm_add_epi32(xmm7, xmm6);
976 //sum += qlp_coeff[5] * data[i-6];
977 //sum += qlp_coeff[4] * data[i-5];
978 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
979 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
980 xmm6 = _mm_mul_epu32(xmm6, xmm2);
981 xmm7 = _mm_add_epi32(xmm7, xmm6);
983 //sum += qlp_coeff[3] * data[i-4];
984 //sum += qlp_coeff[2] * data[i-3];
985 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
986 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
987 xmm6 = _mm_mul_epu32(xmm6, xmm1);
988 xmm7 = _mm_add_epi32(xmm7, xmm6);
990 //sum += qlp_coeff[1] * data[i-2];
991 //sum += qlp_coeff[0] * data[i-1];
992 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
993 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
994 xmm6 = _mm_mul_epu32(xmm6, xmm0);
995 xmm7 = _mm_add_epi32(xmm7, xmm6);
997 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
998 RESIDUAL32_RESULT(xmm7);
1003 else if(order > 4) { /* order == 5, 6, 7, 8 */
1004 if(order > 6) { /* order == 7, 8 */
1005 if(order == 8) {
1006 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1007 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1008 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1009 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1010 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
1012 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1013 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1014 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1015 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
1017 for(i = 0; i < (int)data_len; i++) {
1018 //sum = 0;
1019 //sum += qlp_coeff[7] * data[i-8];
1020 //sum += qlp_coeff[6] * data[i-7];
1021 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
1022 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1023 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1025 //sum += qlp_coeff[5] * data[i-6];
1026 //sum += qlp_coeff[4] * data[i-5];
1027 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1028 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1029 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1030 xmm7 = _mm_add_epi32(xmm7, xmm6);
1032 //sum += qlp_coeff[3] * data[i-4];
1033 //sum += qlp_coeff[2] * data[i-3];
1034 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1035 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1036 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1037 xmm7 = _mm_add_epi32(xmm7, xmm6);
1039 //sum += qlp_coeff[1] * data[i-2];
1040 //sum += qlp_coeff[0] * data[i-1];
1041 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1042 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1043 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1044 xmm7 = _mm_add_epi32(xmm7, xmm6);
1046 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1047 RESIDUAL32_RESULT(xmm7);
1050 else { /* order == 7 */
1051 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
1052 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1053 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1054 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1055 xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
1057 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1058 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1059 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1061 for(i = 0; i < (int)data_len; i++) {
1062 //sum = 0;
1063 //sum = qlp_coeff[6] * data[i-7];
1064 xmm7 = _mm_cvtsi32_si128(data[i-7]);
1065 xmm7 = _mm_mul_epu32(xmm7, xmm3);
1067 //sum += qlp_coeff[5] * data[i-6];
1068 //sum += qlp_coeff[4] * data[i-5];
1069 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1070 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1071 xmm6 = _mm_mul_epu32(xmm6, xmm2);
1072 xmm7 = _mm_add_epi32(xmm7, xmm6);
1074 //sum += qlp_coeff[3] * data[i-4];
1075 //sum += qlp_coeff[2] * data[i-3];
1076 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1077 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1078 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1079 xmm7 = _mm_add_epi32(xmm7, xmm6);
1081 //sum += qlp_coeff[1] * data[i-2];
1082 //sum += qlp_coeff[0] * data[i-1];
1083 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1084 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1085 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1086 xmm7 = _mm_add_epi32(xmm7, xmm6);
1088 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1089 RESIDUAL32_RESULT(xmm7);
1093 else { /* order == 5, 6 */
1094 if(order == 6) {
1095 __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1096 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1097 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1098 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
1100 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1101 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1102 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
1104 for(i = 0; i < (int)data_len; i++) {
1105 //sum = 0;
1106 //sum += qlp_coeff[5] * data[i-6];
1107 //sum += qlp_coeff[4] * data[i-5];
1108 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
1109 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1110 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1112 //sum += qlp_coeff[3] * data[i-4];
1113 //sum += qlp_coeff[2] * data[i-3];
1114 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1115 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1116 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1117 xmm7 = _mm_add_epi32(xmm7, xmm6);
1119 //sum += qlp_coeff[1] * data[i-2];
1120 //sum += qlp_coeff[0] * data[i-1];
1121 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1122 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1123 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1124 xmm7 = _mm_add_epi32(xmm7, xmm6);
1126 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1127 RESIDUAL32_RESULT(xmm7);
1130 else { /* order == 5 */
1131 __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
1132 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1133 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1134 xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
1136 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1137 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1139 for(i = 0; i < (int)data_len; i++) {
1140 //sum = 0;
1141 //sum = qlp_coeff[4] * data[i-5];
1142 xmm7 = _mm_cvtsi32_si128(data[i-5]);
1143 xmm7 = _mm_mul_epu32(xmm7, xmm2);
1145 //sum += qlp_coeff[3] * data[i-4];
1146 //sum += qlp_coeff[2] * data[i-3];
1147 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1148 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1149 xmm6 = _mm_mul_epu32(xmm6, xmm1);
1150 xmm7 = _mm_add_epi32(xmm7, xmm6);
1152 //sum += qlp_coeff[1] * data[i-2];
1153 //sum += qlp_coeff[0] * data[i-1];
1154 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1155 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1156 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1157 xmm7 = _mm_add_epi32(xmm7, xmm6);
1159 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1160 RESIDUAL32_RESULT(xmm7);
1165 else { /* order == 1, 2, 3, 4 */
1166 if(order > 2) { /* order == 3, 4 */
1167 if(order == 4) {
1168 __m128i xmm0, xmm1, xmm6, xmm7;
1169 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1170 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
1172 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1173 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
1175 for(i = 0; i < (int)data_len; i++) {
1176 //sum = 0;
1177 //sum += qlp_coeff[3] * data[i-4];
1178 //sum += qlp_coeff[2] * data[i-3];
1179 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
1180 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1181 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1183 //sum += qlp_coeff[1] * data[i-2];
1184 //sum += qlp_coeff[0] * data[i-1];
1185 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1186 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1187 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1188 xmm7 = _mm_add_epi32(xmm7, xmm6);
1190 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1191 RESIDUAL32_RESULT(xmm7);
1194 else { /* order == 3 */
1195 __m128i xmm0, xmm1, xmm6, xmm7;
1196 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1197 xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
1199 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1201 for(i = 0; i < (int)data_len; i++) {
1202 //sum = 0;
1203 //sum = qlp_coeff[2] * data[i-3];
1204 xmm7 = _mm_cvtsi32_si128(data[i-3]);
1205 xmm7 = _mm_mul_epu32(xmm7, xmm1);
1207 //sum += qlp_coeff[1] * data[i-2];
1208 //sum += qlp_coeff[0] * data[i-1];
1209 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1210 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
1211 xmm6 = _mm_mul_epu32(xmm6, xmm0);
1212 xmm7 = _mm_add_epi32(xmm7, xmm6);
1214 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1215 RESIDUAL32_RESULT(xmm7);
1219 else { /* order == 1, 2 */
1220 if(order == 2) {
1221 __m128i xmm0, xmm7;
1222 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
1223 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
1225 for(i = 0; i < (int)data_len; i++) {
1226 //sum = 0;
1227 //sum += qlp_coeff[1] * data[i-2];
1228 //sum += qlp_coeff[0] * data[i-1];
1229 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
1230 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
1231 xmm7 = _mm_mul_epu32(xmm7, xmm0);
1233 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
1234 RESIDUAL32_RESULT(xmm7);
1237 else { /* order == 1 */
1238 for(i = 0; i < (int)data_len; i++)
1239 residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
1244 else { /* order > 12 */
1245 FLAC__int32 sum;
1246 for(i = 0; i < (int)data_len; i++) {
1247 sum = 0;
1248 switch(order) {
1249 case 32: sum += qlp_coeff[31] * data[i-32];
1250 case 31: sum += qlp_coeff[30] * data[i-31];
1251 case 30: sum += qlp_coeff[29] * data[i-30];
1252 case 29: sum += qlp_coeff[28] * data[i-29];
1253 case 28: sum += qlp_coeff[27] * data[i-28];
1254 case 27: sum += qlp_coeff[26] * data[i-27];
1255 case 26: sum += qlp_coeff[25] * data[i-26];
1256 case 25: sum += qlp_coeff[24] * data[i-25];
1257 case 24: sum += qlp_coeff[23] * data[i-24];
1258 case 23: sum += qlp_coeff[22] * data[i-23];
1259 case 22: sum += qlp_coeff[21] * data[i-22];
1260 case 21: sum += qlp_coeff[20] * data[i-21];
1261 case 20: sum += qlp_coeff[19] * data[i-20];
1262 case 19: sum += qlp_coeff[18] * data[i-19];
1263 case 18: sum += qlp_coeff[17] * data[i-18];
1264 case 17: sum += qlp_coeff[16] * data[i-17];
1265 case 16: sum += qlp_coeff[15] * data[i-16];
1266 case 15: sum += qlp_coeff[14] * data[i-15];
1267 case 14: sum += qlp_coeff[13] * data[i-14];
1268 case 13: sum += qlp_coeff[12] * data[i-13];
1269 sum += qlp_coeff[11] * data[i-12];
1270 sum += qlp_coeff[10] * data[i-11];
1271 sum += qlp_coeff[ 9] * data[i-10];
1272 sum += qlp_coeff[ 8] * data[i- 9];
1273 sum += qlp_coeff[ 7] * data[i- 8];
1274 sum += qlp_coeff[ 6] * data[i- 7];
1275 sum += qlp_coeff[ 5] * data[i- 6];
1276 sum += qlp_coeff[ 4] * data[i- 5];
1277 sum += qlp_coeff[ 3] * data[i- 4];
1278 sum += qlp_coeff[ 2] * data[i- 3];
1279 sum += qlp_coeff[ 1] * data[i- 2];
1280 sum += qlp_coeff[ 0] * data[i- 1];
1282 residual[i] = data[i] - (sum >> lp_quantization);
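/* The restore routine below mirrors the 16-bit residual routine above: the
   same packed-16-bit shift registers and _mm_madd_epi16 dot products are
   used, but DATA16_RESULT adds the predictor sum to the incoming residual
   and the restored sample is fed back in as `curr`.  Orders below 8 are
   handed to the plain C FLAC__lpc_restore_signal(). */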
1287 FLAC__SSE_TARGET("sse2")
1288 void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
1290 int i;
1291 FLAC__int32 sum;
1292 if (order < 8) {
1293 FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
1294 return;
1297 FLAC__ASSERT(order > 0);
1298 FLAC__ASSERT(order <= 32);
1299 FLAC__ASSERT(data_len > 0);
1301 if(order <= 12) {
1302 FLAC__int32 curr;
1303 if(order > 8) { /* order == 9, 10, 11, 12 */
1304 #ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
1305 int r;
1306 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
1307 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1308 xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1309 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
1310 switch(order) /* ...and zero them out */
1312 case 9:
1313 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
1314 case 10:
1315 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
1316 case 11:
1317 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
1319 xmm2 = _mm_setzero_si128();
1320 xmm0 = _mm_packs_epi32(xmm0, xmm6);
1321 xmm1 = _mm_packs_epi32(xmm1, xmm2);
1323 xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
1324 xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
1325 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1326 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
1327 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
1328 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1329 xmm4 = _mm_packs_epi32(xmm4, xmm2);
1330 xmm3 = _mm_packs_epi32(xmm3, xmm5);
1332 xmm7 = _mm_slli_si128(xmm1, 2);
1333 xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
1334 xmm2 = _mm_slli_si128(xmm0, 2);
1336 /* xmm0, xmm1: qlp_coeff
1337 xmm2, xmm7: qlp_coeff << 16 bit
1338 xmm3, xmm4: data */
1340 xmm6 = xmm4;
1341 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1342 xmm5 = xmm3;
1343 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1344 xmm6 = _mm_add_epi32(xmm6, xmm5);
1345 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1346 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1348 DATA16_RESULT(xmm6);
1350 data_len--;
1351 r = data_len % 2;
1353 if(r) {
1354 xmm4 = _mm_slli_si128(xmm4, 2);
1355 xmm6 = xmm3;
1356 xmm3 = _mm_slli_si128(xmm3, 2);
1357 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
1358 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1360 xmm6 = xmm4;
1361 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1362 xmm5 = xmm3;
1363 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1364 xmm6 = _mm_add_epi32(xmm6, xmm5);
1365 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1366 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1368 DATA16_RESULT(xmm6);
1370 data_len--;
1373 while(data_len) { /* data_len is a multiple of 2 */
1374 /* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
1375 xmm4 = _mm_slli_si128(xmm4, 4);
1376 xmm6 = xmm3;
1377 xmm3 = _mm_slli_si128(xmm3, 4);
1378 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 12));
1379 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1381 xmm6 = xmm4;
1382 xmm6 = _mm_madd_epi16(xmm6, xmm7);
1383 xmm5 = xmm3;
1384 xmm5 = _mm_madd_epi16(xmm5, xmm2);
1385 xmm6 = _mm_add_epi32(xmm6, xmm5);
1386 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1387 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1389 DATA16_RESULT(xmm6);
1391 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1393 xmm6 = xmm4;
1394 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1395 xmm5 = xmm3;
1396 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1397 xmm6 = _mm_add_epi32(xmm6, xmm5);
1398 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1399 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1401 DATA16_RESULT(xmm6);
1403 data_len-=2;
1405 #else /* 16 XMM registers available */
1406 int r;
1407 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9, xmmA, xmmB;
1408 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1409 xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1410 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
1411 switch(order) /* ...and zero them out */
1413 case 9:
1414 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
1415 case 10:
1416 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
1417 case 11:
1418 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
1420 xmm2 = _mm_setzero_si128();
1421 xmm0 = _mm_packs_epi32(xmm0, xmm6);
1422 xmm1 = _mm_packs_epi32(xmm1, xmm2);
1424 xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
1425 xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
1426 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1427 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
1428 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
1429 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1430 xmm4 = _mm_packs_epi32(xmm4, xmm2);
1431 xmm3 = _mm_packs_epi32(xmm3, xmm5);
1433 xmm7 = _mm_slli_si128(xmm1, 2);
1434 xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
1435 xmm2 = _mm_slli_si128(xmm0, 2);
1437 xmm9 = _mm_slli_si128(xmm1, 4);
1438 xmm9 = _mm_or_si128(xmm9, _mm_srli_si128(xmm0, 12));
1439 xmm8 = _mm_slli_si128(xmm0, 4);
1441 xmmB = _mm_slli_si128(xmm1, 6);
1442 xmmB = _mm_or_si128(xmmB, _mm_srli_si128(xmm0, 10));
1443 xmmA = _mm_slli_si128(xmm0, 6);
1445 /* xmm0, xmm1: qlp_coeff
1446 xmm2, xmm7: qlp_coeff << 16 bit
1447 xmm8, xmm9: qlp_coeff << 2*16 bit
1448 xmmA, xmmB: qlp_coeff << 3*16 bit
1449 xmm3, xmm4: data */
1451 xmm6 = xmm4;
1452 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1453 xmm5 = xmm3;
1454 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1455 xmm6 = _mm_add_epi32(xmm6, xmm5);
1456 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1457 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1459 DATA16_RESULT(xmm6);
1461 data_len--;
1462 r = data_len % 4;
1464 while(r) {
1465 xmm4 = _mm_slli_si128(xmm4, 2);
1466 xmm6 = xmm3;
1467 xmm3 = _mm_slli_si128(xmm3, 2);
1468 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 14));
1469 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1471 xmm6 = xmm4;
1472 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1473 xmm5 = xmm3;
1474 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1475 xmm6 = _mm_add_epi32(xmm6, xmm5);
1476 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1477 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1479 DATA16_RESULT(xmm6);
1481 data_len--; r--;
1484 while(data_len) { /* data_len is a multiple of 4 */
1485 xmm4 = _mm_slli_si128(xmm4, 8);
1486 xmm6 = xmm3;
1487 xmm3 = _mm_slli_si128(xmm3, 8);
1488 xmm4 = _mm_or_si128(xmm4, _mm_srli_si128(xmm6, 8));
1490 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
1492 xmm6 = xmm4;
1493 xmm6 = _mm_madd_epi16(xmm6, xmmB);
1494 xmm5 = xmm3;
1495 xmm5 = _mm_madd_epi16(xmm5, xmmA);
1496 xmm6 = _mm_add_epi32(xmm6, xmm5);
1497 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1498 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1500 DATA16_RESULT(xmm6);
1502 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1504 xmm6 = xmm4;
1505 xmm6 = _mm_madd_epi16(xmm6, xmm9);
1506 xmm5 = xmm3;
1507 xmm5 = _mm_madd_epi16(xmm5, xmm8);
1508 xmm6 = _mm_add_epi32(xmm6, xmm5);
1509 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1510 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1512 DATA16_RESULT(xmm6);
1514 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1516 xmm6 = xmm4;
1517 xmm6 = _mm_madd_epi16(xmm6, xmm7);
1518 xmm5 = xmm3;
1519 xmm5 = _mm_madd_epi16(xmm5, xmm2);
1520 xmm6 = _mm_add_epi32(xmm6, xmm5);
1521 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1522 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1524 DATA16_RESULT(xmm6);
1526 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1528 xmm6 = xmm4;
1529 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1530 xmm5 = xmm3;
1531 xmm5 = _mm_madd_epi16(xmm5, xmm0);
1532 xmm6 = _mm_add_epi32(xmm6, xmm5);
1533 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1534 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1536 DATA16_RESULT(xmm6);
1538 data_len-=4;
1540 #endif
1541 } /* endif(order > 8) */
1542 else if(order > 4) { /* order == 5, 6, 7, 8 */
1543 if(order > 6) { /* order == 7, 8 */
1544 if(order == 8) {
1545 __m128i xmm0, xmm1, xmm3, xmm6;
1546 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1547 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1548 xmm0 = _mm_packs_epi32(xmm0, xmm1);
1550 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1551 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1552 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1553 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1554 xmm3 = _mm_packs_epi32(xmm3, xmm1);
1556 /* xmm0: qlp_coeff
1557 xmm3: data */
1559 xmm6 = xmm3;
1560 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1561 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1562 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1564 DATA16_RESULT(xmm6);
1566 data_len--;
1568 while(data_len) {
1569 xmm3 = _mm_slli_si128(xmm3, 2);
1570 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1572 xmm6 = xmm3;
1573 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1574 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1575 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1577 DATA16_RESULT(xmm6);
1579 data_len--;
1582 else { /* order == 7 */
1583 int r;
1584 __m128i xmm0, xmm1, xmm2, xmm3, xmm6;
1585 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1586 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1587 xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4);
1588 xmm0 = _mm_packs_epi32(xmm0, xmm1);
1590 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1591 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1592 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1593 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1594 xmm3 = _mm_packs_epi32(xmm3, xmm1);
1595 xmm2 = _mm_slli_si128(xmm0, 2);
1597 /* xmm0: qlp_coeff
1598 xmm2: qlp_coeff << 16 bit
1599 xmm3: data */
1601 xmm6 = xmm3;
1602 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1603 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1604 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1606 DATA16_RESULT(xmm6);
1608 data_len--;
1609 r = data_len % 2;
1611 if(r) {
1612 xmm3 = _mm_slli_si128(xmm3, 2);
1613 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1615 xmm6 = xmm3;
1616 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1617 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1618 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1620 DATA16_RESULT(xmm6);
1622 data_len--;
1625 while(data_len) { /* data_len is a multiple of 2 */
1626 xmm3 = _mm_slli_si128(xmm3, 4);
1627 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1629 xmm6 = xmm3;
1630 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1631 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1632 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1634 DATA16_RESULT(xmm6);
1636 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1637 xmm6 = xmm3;
1638 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1639 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1640 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1642 DATA16_RESULT(xmm6);
1644 data_len-=2;
1648 else { /* order == 5, 6 */
1649 if(order == 6) {
1650 int r;
1651 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6;
1652 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1653 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1654 xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8);
1655 xmm0 = _mm_packs_epi32(xmm0, xmm1);
1657 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1658 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1659 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1660 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1661 xmm3 = _mm_packs_epi32(xmm3, xmm1);
1662 xmm2 = _mm_slli_si128(xmm0, 2);
1663 xmm4 = _mm_slli_si128(xmm0, 4);
1665 /* xmm0: qlp_coeff
1666 xmm2: qlp_coeff << 16 bit
1667 xmm4: qlp_coeff << 2*16 bit
1668 xmm3: data */
1670 xmm6 = xmm3;
1671 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1672 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1673 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1675 DATA16_RESULT(xmm6);
1677 data_len--;
1678 r = data_len % 3;
1680 while(r) {
1681 xmm3 = _mm_slli_si128(xmm3, 2);
1682 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1684 xmm6 = xmm3;
1685 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1686 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1687 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1689 DATA16_RESULT(xmm6);
1691 data_len--; r--;
1694 while(data_len) { /* data_len is a multiple of 3 */
1695 xmm3 = _mm_slli_si128(xmm3, 6);
1696 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1698 xmm6 = xmm3;
1699 xmm6 = _mm_madd_epi16(xmm6, xmm4);
1700 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1701 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1703 DATA16_RESULT(xmm6);
1705 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1707 xmm6 = xmm3;
1708 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1709 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1710 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1712 DATA16_RESULT(xmm6);
1714 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1716 xmm6 = xmm3;
1717 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1718 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1719 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1721 DATA16_RESULT(xmm6);
1723 data_len-=3;
1726 else { /* order == 5 */
1727 int r;
1728 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1729 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1730 xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
1731 xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12);
1732 xmm0 = _mm_packs_epi32(xmm0, xmm1);
1734 xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
1735 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1736 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
1737 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1738 xmm3 = _mm_packs_epi32(xmm3, xmm1);
1739 xmm2 = _mm_slli_si128(xmm0, 2);
1740 xmm4 = _mm_slli_si128(xmm0, 4);
1741 xmm5 = _mm_slli_si128(xmm0, 6);
1743 /* xmm0: qlp_coeff
1744 xmm2: qlp_coeff << 16 bit
1745 xmm4: qlp_coeff << 2*16 bit
1746 xmm5: qlp_coeff << 3*16 bit
1747 xmm3: data */
1749 xmm6 = xmm3;
1750 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1751 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1752 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1754 DATA16_RESULT(xmm6);
1756 data_len--;
1757 r = data_len % 4;
1759 while(r) {
1760 xmm3 = _mm_slli_si128(xmm3, 2);
1761 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1763 xmm6 = xmm3;
1764 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1765 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1766 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1768 DATA16_RESULT(xmm6);
1770 data_len--; r--;
1773 while(data_len) { /* data_len is a multiple of 4 */
1774 xmm3 = _mm_slli_si128(xmm3, 8);
1775 xmm3 = _mm_insert_epi16(xmm3, curr, 3);
1777 xmm6 = xmm3;
1778 xmm6 = _mm_madd_epi16(xmm6, xmm5);
1779 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1780 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1782 DATA16_RESULT(xmm6);
1784 xmm3 = _mm_insert_epi16(xmm3, curr, 2);
1786 xmm6 = xmm3;
1787 xmm6 = _mm_madd_epi16(xmm6, xmm4);
1788 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1789 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1791 DATA16_RESULT(xmm6);
1793 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1795 xmm6 = xmm3;
1796 xmm6 = _mm_madd_epi16(xmm6, xmm2);
1797 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1798 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1800 DATA16_RESULT(xmm6);
1802 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1804 xmm6 = xmm3;
1805 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1806 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
1807 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1809 DATA16_RESULT(xmm6);
1811 data_len-=4;
1816 else { /* order == 1, 2, 3, 4 */
1817 if(order > 2) { /* order == 3, 4 */
1818 if(order == 4) {
1819 __m128i xmm0, xmm3, xmm6;
1820 xmm6 = _mm_setzero_si128();
1821 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1822 xmm0 = _mm_packs_epi32(xmm0, xmm6);
1824 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1825 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1826 xmm3 = _mm_packs_epi32(xmm3, xmm6);
1828 /* xmm0: qlp_coeff
1829 xmm3: data */
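/* Only the low four 16-bit lanes are live here, so a single
   _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)) finishes the horizontal sum and
   the loop below produces one sample per iteration. */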
1831 xmm6 = xmm3;
1832 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1833 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1835 DATA16_RESULT(xmm6);
1837 data_len--;
1839 while(data_len) {
1840 xmm3 = _mm_slli_si128(xmm3, 2);
1841 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1843 xmm6 = xmm3;
1844 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1845 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1847 DATA16_RESULT(xmm6);
1849 data_len--;
1852 else { /* order == 3 */
1853 int r;
1854 __m128i xmm0, xmm1, xmm3, xmm6;
1855 xmm6 = _mm_setzero_si128();
1856 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1857 xmm0 = _mm_slli_si128(xmm0, 4); xmm0 = _mm_srli_si128(xmm0, 4);
1858 xmm0 = _mm_packs_epi32(xmm0, xmm6);
1860 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1861 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1862 xmm3 = _mm_packs_epi32(xmm3, xmm6);
1863 xmm1 = _mm_slli_si128(xmm0, 2);
1865 /* xmm0: qlp_coeff
1866 xmm1: qlp_coeff << 16 bit
1867 xmm3: data */
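/* With three taps in four live lanes the loop below emits two samples per
   iteration: xmm1 (coefficients shifted up one lane) gives the first prediction,
   xmm0 the second. */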
1869 xmm6 = xmm3;
1870 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1871 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1873 DATA16_RESULT(xmm6);
1875 data_len--;
1876 r = data_len % 2;
1878 if(r) {
1879 xmm3 = _mm_slli_si128(xmm3, 2);
1880 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1882 xmm6 = xmm3;
1883 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1884 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1886 DATA16_RESULT(xmm6);
1888 data_len--;
1891 while(data_len) { /* data_len is a multiple of 2 */
1892 xmm3 = _mm_slli_si128(xmm3, 4);
1894 xmm3 = _mm_insert_epi16(xmm3, curr, 1);
1896 xmm6 = xmm3;
1897 xmm6 = _mm_madd_epi16(xmm6, xmm1);
1898 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1900 DATA16_RESULT(xmm6);
1902 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1904 xmm6 = xmm3;
1905 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1906 xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
1908 DATA16_RESULT(xmm6);
1910 data_len-=2;
1914 else { /* order == 1, 2 */
1915 if(order == 2) {
1916 __m128i xmm0, xmm3, xmm6;
1917 xmm6 = _mm_setzero_si128();
1918 xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
1919 xmm0 = _mm_slli_si128(xmm0, 8); xmm0 = _mm_srli_si128(xmm0, 8);
1920 xmm0 = _mm_packs_epi32(xmm0, xmm6);
1922 xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
1923 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
1924 xmm3 = _mm_packs_epi32(xmm3, xmm6);
1926 /* xmm0: qlp_coeff
1927 xmm3: data */
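/* For order == 2 a single _mm_madd_epi16 already leaves the whole prediction
   q[0]*data[-1] + q[1]*data[-2] in the low dword, so no horizontal add is needed
   before DATA16_RESULT. */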
1929 xmm6 = xmm3;
1930 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1932 DATA16_RESULT(xmm6);
1934 data_len--;
1936 while(data_len) {
1937 xmm3 = _mm_slli_si128(xmm3, 2);
1938 xmm3 = _mm_insert_epi16(xmm3, curr, 0);
1940 xmm6 = xmm3;
1941 xmm6 = _mm_madd_epi16(xmm6, xmm0);
1943 DATA16_RESULT(xmm6);
1945 data_len--;
1948 else { /* order == 1 */
1949 for(i = 0; i < (int)data_len; i++)
1950 data[i] = residual[i] + ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
1955 else { /* order > 12 */
1956 for(i = 0; i < (int)data_len; i++) {
1957 sum = 0;
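/* Intentional switch fall-through: execution enters at the case equal to order
   and every case below it adds one more coefficient tap, so exactly order taps
   are accumulated into sum. */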
1958 switch(order) {
1959 case 32: sum += qlp_coeff[31] * data[i-32];
1960 case 31: sum += qlp_coeff[30] * data[i-31];
1961 case 30: sum += qlp_coeff[29] * data[i-30];
1962 case 29: sum += qlp_coeff[28] * data[i-29];
1963 case 28: sum += qlp_coeff[27] * data[i-28];
1964 case 27: sum += qlp_coeff[26] * data[i-27];
1965 case 26: sum += qlp_coeff[25] * data[i-26];
1966 case 25: sum += qlp_coeff[24] * data[i-25];
1967 case 24: sum += qlp_coeff[23] * data[i-24];
1968 case 23: sum += qlp_coeff[22] * data[i-23];
1969 case 22: sum += qlp_coeff[21] * data[i-22];
1970 case 21: sum += qlp_coeff[20] * data[i-21];
1971 case 20: sum += qlp_coeff[19] * data[i-20];
1972 case 19: sum += qlp_coeff[18] * data[i-19];
1973 case 18: sum += qlp_coeff[17] * data[i-18];
1974 case 17: sum += qlp_coeff[16] * data[i-17];
1975 case 16: sum += qlp_coeff[15] * data[i-16];
1976 case 15: sum += qlp_coeff[14] * data[i-15];
1977 case 14: sum += qlp_coeff[13] * data[i-14];
1978 case 13: sum += qlp_coeff[12] * data[i-13];
1979 sum += qlp_coeff[11] * data[i-12];
1980 sum += qlp_coeff[10] * data[i-11];
1981 sum += qlp_coeff[ 9] * data[i-10];
1982 sum += qlp_coeff[ 8] * data[i- 9];
1983 sum += qlp_coeff[ 7] * data[i- 8];
1984 sum += qlp_coeff[ 6] * data[i- 7];
1985 sum += qlp_coeff[ 5] * data[i- 6];
1986 sum += qlp_coeff[ 4] * data[i- 5];
1987 sum += qlp_coeff[ 3] * data[i- 4];
1988 sum += qlp_coeff[ 2] * data[i- 3];
1989 sum += qlp_coeff[ 1] * data[i- 2];
1990 sum += qlp_coeff[ 0] * data[i- 1];
1992 data[i] = residual[i] + (sum >> lp_quantization);
1997 FLAC__SSE_TARGET("sse2")
1998 void FLAC__lpc_restore_signal_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
2000 int i;
2002 FLAC__ASSERT(order > 0);
2003 FLAC__ASSERT(order <= 32);
2005 if(order <= 12) {
2006 if(order > 8) { /* order == 9, 10, 11, 12 */
2007 if(order > 10) { /* order == 11, 12 */
2008 if(order == 12) {
2009 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
2010 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0)); // 0 0 q[1] q[0]
2011 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2)); // 0 0 q[3] q[2]
2012 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4)); // 0 0 q[5] q[4]
2013 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6)); // 0 0 q[7] q[6]
2014 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8)); // 0 0 q[9] q[8]
2015 xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]
2017 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
2018 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
2019 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
2020 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
2021 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
2022 xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]
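/* The shuffles above spread each coefficient pair into the even dword lanes;
   the data shuffles in the loop below place the matching samples in the same
   lanes, so one _mm_mul_epu32 computes two taps per instruction. */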
2024 for(i = 0; i < (int)data_len; i++) {
2025 //sum = 0;
2026 //sum += qlp_coeff[11] * data[i-12];
2027 //sum += qlp_coeff[10] * data[i-11];
2028 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
2029 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
2030 xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard the high dword of each result */
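/* This is safe because the low 32 bits of a 32x32-bit product do not depend on
   the signedness of the operands, and only the low dword of each product
   reaches the value extracted by DATA32_RESULT. */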
2032 //sum += qlp_coeff[9] * data[i-10];
2033 //sum += qlp_coeff[8] * data[i-9];
2034 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
2035 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2036 xmm6 = _mm_mul_epu32(xmm6, xmm4);
2037 xmm7 = _mm_add_epi32(xmm7, xmm6);
2039 //sum += qlp_coeff[7] * data[i-8];
2040 //sum += qlp_coeff[6] * data[i-7];
2041 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2042 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2043 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2044 xmm7 = _mm_add_epi32(xmm7, xmm6);
2046 //sum += qlp_coeff[5] * data[i-6];
2047 //sum += qlp_coeff[4] * data[i-5];
2048 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2049 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2050 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2051 xmm7 = _mm_add_epi32(xmm7, xmm6);
2053 //sum += qlp_coeff[3] * data[i-4];
2054 //sum += qlp_coeff[2] * data[i-3];
2055 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2056 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2057 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2058 xmm7 = _mm_add_epi32(xmm7, xmm6);
2060 //sum += qlp_coeff[1] * data[i-2];
2061 //sum += qlp_coeff[0] * data[i-1];
2062 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2063 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2064 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2065 xmm7 = _mm_add_epi32(xmm7, xmm6);
2067 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2068 DATA32_RESULT(xmm7);
2071 else { /* order == 11 */
2072 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
2073 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2074 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2075 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2076 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2077 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
2078 xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);
2080 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2081 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2082 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2083 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2084 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
2086 for(i = 0; i < (int)data_len; i++) {
2087 //sum = 0;
2088 //sum = qlp_coeff[10] * data[i-11];
2089 xmm7 = _mm_cvtsi32_si128(data[i-11]);
2090 xmm7 = _mm_mul_epu32(xmm7, xmm5);
2092 //sum += qlp_coeff[9] * data[i-10];
2093 //sum += qlp_coeff[8] * data[i-9];
2094 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
2095 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2096 xmm6 = _mm_mul_epu32(xmm6, xmm4);
2097 xmm7 = _mm_add_epi32(xmm7, xmm6);
2099 //sum += qlp_coeff[7] * data[i-8];
2100 //sum += qlp_coeff[6] * data[i-7];
2101 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2102 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2103 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2104 xmm7 = _mm_add_epi32(xmm7, xmm6);
2106 //sum += qlp_coeff[5] * data[i-6];
2107 //sum += qlp_coeff[4] * data[i-5];
2108 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2109 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2110 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2111 xmm7 = _mm_add_epi32(xmm7, xmm6);
2113 //sum += qlp_coeff[3] * data[i-4];
2114 //sum += qlp_coeff[2] * data[i-3];
2115 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2116 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2117 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2118 xmm7 = _mm_add_epi32(xmm7, xmm6);
2120 //sum += qlp_coeff[1] * data[i-2];
2121 //sum += qlp_coeff[0] * data[i-1];
2122 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2123 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2124 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2125 xmm7 = _mm_add_epi32(xmm7, xmm6);
2127 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2128 DATA32_RESULT(xmm7);
2132 else { /* order == 9, 10 */
2133 if(order == 10) {
2134 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
2135 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2136 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2137 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2138 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2139 xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
2141 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2142 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2143 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2144 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2145 xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
2147 for(i = 0; i < (int)data_len; i++) {
2148 //sum = 0;
2149 //sum += qlp_coeff[9] * data[i-10];
2150 //sum += qlp_coeff[8] * data[i-9];
2151 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
2152 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2153 xmm7 = _mm_mul_epu32(xmm7, xmm4);
2155 //sum += qlp_coeff[7] * data[i-8];
2156 //sum += qlp_coeff[6] * data[i-7];
2157 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2158 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2159 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2160 xmm7 = _mm_add_epi32(xmm7, xmm6);
2162 //sum += qlp_coeff[5] * data[i-6];
2163 //sum += qlp_coeff[4] * data[i-5];
2164 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2165 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2166 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2167 xmm7 = _mm_add_epi32(xmm7, xmm6);
2169 //sum += qlp_coeff[3] * data[i-4];
2170 //sum += qlp_coeff[2] * data[i-3];
2171 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2172 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2173 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2174 xmm7 = _mm_add_epi32(xmm7, xmm6);
2176 //sum += qlp_coeff[1] * data[i-2];
2177 //sum += qlp_coeff[0] * data[i-1];
2178 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2179 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2180 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2181 xmm7 = _mm_add_epi32(xmm7, xmm6);
2183 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2184 DATA32_RESULT(xmm7);
2187 else { /* order == 9 */
2188 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
2189 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2190 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2191 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2192 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2193 xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);
2195 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2196 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2197 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2198 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2200 for(i = 0; i < (int)data_len; i++) {
2201 //sum = 0;
2202 //sum = qlp_coeff[8] * data[i-9];
2203 xmm7 = _mm_cvtsi32_si128(data[i-9]);
2204 xmm7 = _mm_mul_epu32(xmm7, xmm4);
2206 //sum += qlp_coeff[7] * data[i-8];
2207 //sum += qlp_coeff[6] * data[i-7];
2208 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2209 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2210 xmm6 = _mm_mul_epu32(xmm6, xmm3);
2211 xmm7 = _mm_add_epi32(xmm7, xmm6);
2213 //sum += qlp_coeff[5] * data[i-6];
2214 //sum += qlp_coeff[4] * data[i-5];
2215 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2216 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2217 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2218 xmm7 = _mm_add_epi32(xmm7, xmm6);
2220 //sum += qlp_coeff[3] * data[i-4];
2221 //sum += qlp_coeff[2] * data[i-3];
2222 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2223 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2224 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2225 xmm7 = _mm_add_epi32(xmm7, xmm6);
2227 //sum += qlp_coeff[1] * data[i-2];
2228 //sum += qlp_coeff[0] * data[i-1];
2229 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2230 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2231 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2232 xmm7 = _mm_add_epi32(xmm7, xmm6);
2234 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2235 DATA32_RESULT(xmm7);
2240 else if(order > 4) { /* order == 5, 6, 7, 8 */
2241 if(order > 6) { /* order == 7, 8 */
2242 if(order == 8) {
2243 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
2244 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2245 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2246 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2247 xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
2249 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2250 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2251 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2252 xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
2254 for(i = 0; i < (int)data_len; i++) {
2255 //sum = 0;
2256 //sum += qlp_coeff[7] * data[i-8];
2257 //sum += qlp_coeff[6] * data[i-7];
2258 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
2259 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2260 xmm7 = _mm_mul_epu32(xmm7, xmm3);
2262 //sum += qlp_coeff[5] * data[i-6];
2263 //sum += qlp_coeff[4] * data[i-5];
2264 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2265 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2266 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2267 xmm7 = _mm_add_epi32(xmm7, xmm6);
2269 //sum += qlp_coeff[3] * data[i-4];
2270 //sum += qlp_coeff[2] * data[i-3];
2271 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2272 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2273 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2274 xmm7 = _mm_add_epi32(xmm7, xmm6);
2276 //sum += qlp_coeff[1] * data[i-2];
2277 //sum += qlp_coeff[0] * data[i-1];
2278 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2279 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2280 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2281 xmm7 = _mm_add_epi32(xmm7, xmm6);
2283 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2284 DATA32_RESULT(xmm7);
2287 else { /* order == 7 */
2288 __m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
2289 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2290 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2291 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2292 xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);
2294 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2295 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2296 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2298 for(i = 0; i < (int)data_len; i++) {
2299 //sum = 0;
2300 //sum = qlp_coeff[6] * data[i-7];
2301 xmm7 = _mm_cvtsi32_si128(data[i-7]);
2302 xmm7 = _mm_mul_epu32(xmm7, xmm3);
2304 //sum += qlp_coeff[5] * data[i-6];
2305 //sum += qlp_coeff[4] * data[i-5];
2306 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2307 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2308 xmm6 = _mm_mul_epu32(xmm6, xmm2);
2309 xmm7 = _mm_add_epi32(xmm7, xmm6);
2311 //sum += qlp_coeff[3] * data[i-4];
2312 //sum += qlp_coeff[2] * data[i-3];
2313 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2314 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2315 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2316 xmm7 = _mm_add_epi32(xmm7, xmm6);
2318 //sum += qlp_coeff[1] * data[i-2];
2319 //sum += qlp_coeff[0] * data[i-1];
2320 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2321 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2322 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2323 xmm7 = _mm_add_epi32(xmm7, xmm6);
2325 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2326 DATA32_RESULT(xmm7);
2330 else { /* order == 5, 6 */
2331 if(order == 6) {
2332 __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
2333 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2334 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2335 xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
2337 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2338 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2339 xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
2341 for(i = 0; i < (int)data_len; i++) {
2342 //sum = 0;
2343 //sum += qlp_coeff[5] * data[i-6];
2344 //sum += qlp_coeff[4] * data[i-5];
2345 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
2346 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2347 xmm7 = _mm_mul_epu32(xmm7, xmm2);
2349 //sum += qlp_coeff[3] * data[i-4];
2350 //sum += qlp_coeff[2] * data[i-3];
2351 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2352 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2353 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2354 xmm7 = _mm_add_epi32(xmm7, xmm6);
2356 //sum += qlp_coeff[1] * data[i-2];
2357 //sum += qlp_coeff[0] * data[i-1];
2358 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2359 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2360 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2361 xmm7 = _mm_add_epi32(xmm7, xmm6);
2363 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2364 DATA32_RESULT(xmm7);
2367 else { /* order == 5 */
2368 __m128i xmm0, xmm1, xmm2, xmm6, xmm7;
2369 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2370 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2371 xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);
2373 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2374 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2376 for(i = 0; i < (int)data_len; i++) {
2377 //sum = 0;
2378 //sum = qlp_coeff[4] * data[i-5];
2379 xmm7 = _mm_cvtsi32_si128(data[i-5]);
2380 xmm7 = _mm_mul_epu32(xmm7, xmm2);
2382 //sum += qlp_coeff[3] * data[i-4];
2383 //sum += qlp_coeff[2] * data[i-3];
2384 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2385 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2386 xmm6 = _mm_mul_epu32(xmm6, xmm1);
2387 xmm7 = _mm_add_epi32(xmm7, xmm6);
2389 //sum += qlp_coeff[1] * data[i-2];
2390 //sum += qlp_coeff[0] * data[i-1];
2391 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2392 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2393 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2394 xmm7 = _mm_add_epi32(xmm7, xmm6);
2396 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2397 DATA32_RESULT(xmm7);
2402 else { /* order == 1, 2, 3, 4 */
2403 if(order > 2) { /* order == 3, 4 */
2404 if(order == 4) {
2405 __m128i xmm0, xmm1, xmm6, xmm7;
2406 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2407 xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
2409 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2410 xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
2412 for(i = 0; i < (int)data_len; i++) {
2413 //sum = 0;
2414 //sum += qlp_coeff[3] * data[i-4];
2415 //sum += qlp_coeff[2] * data[i-3];
2416 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
2417 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2418 xmm7 = _mm_mul_epu32(xmm7, xmm1);
2420 //sum += qlp_coeff[1] * data[i-2];
2421 //sum += qlp_coeff[0] * data[i-1];
2422 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2423 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2424 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2425 xmm7 = _mm_add_epi32(xmm7, xmm6);
2427 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2428 DATA32_RESULT(xmm7);
2431 else { /* order == 3 */
2432 __m128i xmm0, xmm1, xmm6, xmm7;
2433 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2434 xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);
2436 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2438 for(i = 0; i < (int)data_len; i++) {
2439 //sum = 0;
2440 //sum = qlp_coeff[2] * data[i-3];
2441 xmm7 = _mm_cvtsi32_si128(data[i-3]);
2442 xmm7 = _mm_mul_epu32(xmm7, xmm1);
2444 //sum += qlp_coeff[1] * data[i-2];
2445 //sum += qlp_coeff[0] * data[i-1];
2446 xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2447 xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
2448 xmm6 = _mm_mul_epu32(xmm6, xmm0);
2449 xmm7 = _mm_add_epi32(xmm7, xmm6);
2451 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2452 DATA32_RESULT(xmm7);
2456 else { /* order == 1, 2 */
2457 if(order == 2) {
2458 __m128i xmm0, xmm7;
2459 xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
2460 xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
2462 for(i = 0; i < (int)data_len; i++) {
2463 //sum = 0;
2464 //sum += qlp_coeff[1] * data[i-2];
2465 //sum += qlp_coeff[0] * data[i-1];
2466 xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
2467 xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
2468 xmm7 = _mm_mul_epu32(xmm7, xmm0);
2470 xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
2471 DATA32_RESULT(xmm7);
2474 else { /* order == 1 */
2475 for(i = 0; i < (int)data_len; i++)
2476 data[i] = residual[i] + ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
2481 else { /* order > 12 */
2482 FLAC__int32 sum;
2483 for(i = 0; i < (int)data_len; i++) {
2484 sum = 0;
2485 switch(order) {
2486 case 32: sum += qlp_coeff[31] * data[i-32];
2487 case 31: sum += qlp_coeff[30] * data[i-31];
2488 case 30: sum += qlp_coeff[29] * data[i-30];
2489 case 29: sum += qlp_coeff[28] * data[i-29];
2490 case 28: sum += qlp_coeff[27] * data[i-28];
2491 case 27: sum += qlp_coeff[26] * data[i-27];
2492 case 26: sum += qlp_coeff[25] * data[i-26];
2493 case 25: sum += qlp_coeff[24] * data[i-25];
2494 case 24: sum += qlp_coeff[23] * data[i-24];
2495 case 23: sum += qlp_coeff[22] * data[i-23];
2496 case 22: sum += qlp_coeff[21] * data[i-22];
2497 case 21: sum += qlp_coeff[20] * data[i-21];
2498 case 20: sum += qlp_coeff[19] * data[i-20];
2499 case 19: sum += qlp_coeff[18] * data[i-19];
2500 case 18: sum += qlp_coeff[17] * data[i-18];
2501 case 17: sum += qlp_coeff[16] * data[i-17];
2502 case 16: sum += qlp_coeff[15] * data[i-16];
2503 case 15: sum += qlp_coeff[14] * data[i-15];
2504 case 14: sum += qlp_coeff[13] * data[i-14];
2505 case 13: sum += qlp_coeff[12] * data[i-13];
2506 sum += qlp_coeff[11] * data[i-12];
2507 sum += qlp_coeff[10] * data[i-11];
2508 sum += qlp_coeff[ 9] * data[i-10];
2509 sum += qlp_coeff[ 8] * data[i- 9];
2510 sum += qlp_coeff[ 7] * data[i- 8];
2511 sum += qlp_coeff[ 6] * data[i- 7];
2512 sum += qlp_coeff[ 5] * data[i- 6];
2513 sum += qlp_coeff[ 4] * data[i- 5];
2514 sum += qlp_coeff[ 3] * data[i- 4];
2515 sum += qlp_coeff[ 2] * data[i- 3];
2516 sum += qlp_coeff[ 1] * data[i- 2];
2517 sum += qlp_coeff[ 0] * data[i- 1];
2519 data[i] = residual[i] + (sum >> lp_quantization);
2524 #endif /* FLAC__SSE2_SUPPORTED */
2525 #endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
2526 #endif /* FLAC__NO_ASM */
2527 #endif /* FLAC__INTEGER_ONLY_LIBRARY */