/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009  Josh Coalson
 * Copyright (C) 2011-2013  Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */
#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define     DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
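
/*
 * The *_RESULT macros above take the finished 32-bit prediction from lane 0
 * of an XMM register, apply the quantization shift, and emit one residual
 * (encoding) or one restored sample (decoding).  The functions below then
 * dispatch on `order` so that every supported predictor order gets a fully
 * unrolled loop with all of its coefficients held in registers.
 */
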
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));
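
					/* Each q register holds one coefficient, masked to 16 bits and
					 * broadcast into all four 32-bit lanes.  _mm_madd_epi16 then
					 * multiplies the low 16 bits of four consecutive samples by that
					 * coefficient (the high half of every lane in q is zero), which
					 * is exact as long as samples and coefficients fit in 16 bits --
					 * the contract of this _16 variant.  Four residuals are computed
					 * per iteration. */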
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
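
		/* Scalar tail: the vector loops above stop once fewer than four
		 * samples remain, so finish them here.  The switch intentionally
		 * falls through so that exactly `order` taps are accumulated. */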
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9:  sum += qlp_coeff[ 8] * data[i- 9];
				case 8:  sum += qlp_coeff[ 7] * data[i- 8];
				case 7:  sum += qlp_coeff[ 6] * data[i- 7];
				case 6:  sum += qlp_coeff[ 5] * data[i- 6];
				case 5:  sum += qlp_coeff[ 4] * data[i- 5];
				case 4:  sum += qlp_coeff[ 3] * data[i- 4];
				case 3:  sum += qlp_coeff[ 2] * data[i- 3];
				case 2:  sum += qlp_coeff[ 1] * data[i- 2];
				case 1:  sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
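
	/* 32-bit path: the coefficients may not fit in 16 bits, so pairs of
	 * coefficients and samples are spread into the even 32-bit lanes and
	 * multiplied with _mm_mul_epu32, keeping only the low dword of each
	 * 64-bit product.  In two's complement the low 32 bits of the unsigned
	 * product equal those of the signed product, so the truncated sums come
	 * out correct. */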

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0  0  q[1]  q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0  0  q[3]  q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0  0  q[5]  q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0  0  q[7]  q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0  0  q[9]  q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0  0  q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0  q[1]  0  q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0  q[3]  0  q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0  q[5]  0  q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0  q[7]  0  q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0  q[9]  0  q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0  q[11] 0  q[10]
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12));  // 0  0  d[i-11]  d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0  d[i-12]  0  d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum  = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	if (order < 8 || order > 12) {
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
		return;
	}
	if (data_len == 0)
		return;

	FLAC__ASSERT(order >= 8);
	FLAC__ASSERT(order <= 12);
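
	/* Approach: pack the coefficients and the sample history down to 16-bit
	 * lanes so that one _mm_madd_epi16 plus two horizontal adds yields the
	 * whole dot product; each newly decoded sample is then shifted into the
	 * history register with _mm_insert_epi16.  This is why the routine only
	 * handles the 16-bit-safe case with 8 <= order <= 12. */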

	if(order > 8) { /* order == 9, 10, 11, 12 */
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
		switch(order)                                          /* ...and zero them out */
		{
			case 9:
				xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
			case 10:
				xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
			case 11:
				xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
		}
		xmm2 = _mm_setzero_si128();
		xmm0 = _mm_packs_epi32(xmm0, xmm6);
		xmm1 = _mm_packs_epi32(xmm1, xmm2);

		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm4 = _mm_packs_epi32(xmm4, xmm2);
		xmm3 = _mm_packs_epi32(xmm3, xmm5);

		xmm7 = _mm_slli_si128(xmm1, 2);
		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
		xmm2 = _mm_slli_si128(xmm0, 2);

		/* xmm0, xmm1: qlp_coeff
		   xmm2, xmm7: qlp_coeff << 16 bit
		   xmm3, xmm4: data */
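
		/* The while loop below decodes two samples per iteration: the history
		 * is shifted one 32-bit lane (two 16-bit samples) at a time, and the
		 * first of the two dot products uses the coefficient copies
		 * pre-shifted by 16 bits (xmm2:xmm7), so the data vector does not
		 * have to be realigned in between. */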

		xmm5 = _mm_madd_epi16(xmm4, xmm1);
		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, xmm5);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		if(data_len % 2) {
			xmm6 = _mm_srli_si128(xmm3, 14);
			xmm4 = _mm_slli_si128(xmm4, 2);
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}

		while(data_len) { /* data_len is a multiple of 2 */
			/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
			xmm6 = _mm_srli_si128(xmm3, 12);
			xmm4 = _mm_slli_si128(xmm4, 4);
			xmm3 = _mm_slli_si128(xmm3, 4);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 1);

			xmm5 = _mm_madd_epi16(xmm4, xmm7);
			xmm6 = _mm_madd_epi16(xmm3, xmm2);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len-=2;
		}
	} /* endif(order > 8) */
	else { /* order == 8 */
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm3, xmm6;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm0 = _mm_packs_epi32(xmm0, xmm1);

		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_packs_epi32(xmm3, xmm1);

		/* xmm0: qlp_coeff
		   xmm3: data */

		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		while(data_len) {
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}
	}
}

#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */