lpc_intrin*: Remove unused code.
[flac.git] / src / libFLAC / lpc_intrin_sse2.c
/* libFLAC - Free Lossless Audio Codec library
 * Copyright (C) 2000-2009 Josh Coalson
 * Copyright (C) 2011-2013 Xiph.Org Foundation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in the
 *   documentation and/or other materials provided with the distribution.
 *
 * - Neither the name of the Xiph.org Foundation nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#ifdef HAVE_CONFIG_H
# include <config.h>
#endif

#ifndef FLAC__INTEGER_ONLY_LIBRARY
#ifndef FLAC__NO_ASM
#if (defined FLAC__CPU_IA32 || defined FLAC__CPU_X86_64) && defined FLAC__HAS_X86INTRIN
#include "private/lpc.h"
#ifdef FLAC__SSE2_SUPPORTED

#include "FLAC/assert.h"
#include "FLAC/format.h"

#include <emmintrin.h> /* SSE2 */

#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;

#define RESIDUAL32_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
#define DATA32_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
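
/* Each *_RESULT macro takes the finished prediction from the low 32-bit lane
 * of an SSE register (via _mm_cvtsi128_si32), applies the quantization shift,
 * and either subtracts it from the sample to form the residual or adds it to
 * the residual to restore the sample. The 16-bit flavors also advance the
 * data/residual pointers; the 32-bit flavors index with i. */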

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;
	FLAC__int32 sum;
	__m128i cnt = _mm_cvtsi32_si128(lp_quantization);

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

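	/* Reference sketch (not compiled): this routine computes the same result
	 * as the scalar FLAC__lpc_compute_residual_from_qlp_coefficients() and the
	 * scalar tail loops below, i.e.
	 *
	 *   for(i = 0; i < (int)data_len; i++) {
	 *       sum = 0;
	 *       for(j = 0; j < (int)order; j++)
	 *           sum += qlp_coeff[j] * data[i-j-1];
	 *       residual[i] = data[i] - (sum >> lp_quantization);
	 *   }
	 *
	 * The SSE2 paths below rely on the samples and quantized coefficients
	 * fitting in 16 bits (which is what the _16 suffix implies), so a single
	 * _mm_madd_epi16 yields four coeff*sample products at once. */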
	if(order <= 12) {
		if(order > 8) {
			if(order > 10) {
				if(order == 12) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));
					q11 = _mm_cvtsi32_si128(0xffff & qlp_coeff[11]); q11 = _mm_shuffle_epi32(q11, _MM_SHUFFLE(0,0,0,0));

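					/* Each qN now holds the low 16 bits of qlp_coeff[N]
					 * broadcast to all four 32-bit lanes (high halves zero), so
					 * _mm_madd_epi16 against four consecutive samples produces
					 * qlp_coeff[N]*data[...] in every lane; one iteration of
					 * the loop below accumulates predictions for 4 residuals. */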
					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q11, _mm_loadu_si128((const __m128i*)(data+i-12)));
						mull = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 11 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));
					q10 = _mm_cvtsi32_si128(0xffff & qlp_coeff[10]); q10 = _mm_shuffle_epi32(q10, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q10, _mm_loadu_si128((const __m128i*)(data+i-11)));
						mull = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 10) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8, q9;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));
					q9 = _mm_cvtsi32_si128(0xffff & qlp_coeff[9]); q9 = _mm_shuffle_epi32(q9, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q9, _mm_loadu_si128((const __m128i*)(data+i-10)));
						mull = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 9 */
					__m128i q0, q1, q2, q3, q4, q5, q6, q7, q8;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));
					q8 = _mm_cvtsi32_si128(0xffff & qlp_coeff[8]); q8 = _mm_shuffle_epi32(q8, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q8, _mm_loadu_si128((const __m128i*)(data+i-9)));
						mull = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else if(order > 4) {
			if(order > 6) {
				if(order == 8) {
					__m128i q0, q1, q2, q3, q4, q5, q6, q7;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));
					q7 = _mm_cvtsi32_si128(0xffff & qlp_coeff[7]); q7 = _mm_shuffle_epi32(q7, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q7, _mm_loadu_si128((const __m128i*)(data+i-8)));
						mull = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 7 */
					__m128i q0, q1, q2, q3, q4, q5, q6;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));
					q6 = _mm_cvtsi32_si128(0xffff & qlp_coeff[6]); q6 = _mm_shuffle_epi32(q6, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q6, _mm_loadu_si128((const __m128i*)(data+i-7)));
						mull = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 6) {
					__m128i q0, q1, q2, q3, q4, q5;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));
					q5 = _mm_cvtsi32_si128(0xffff & qlp_coeff[5]); q5 = _mm_shuffle_epi32(q5, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q5, _mm_loadu_si128((const __m128i*)(data+i-6)));
						mull = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 5 */
					__m128i q0, q1, q2, q3, q4;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));
					q4 = _mm_cvtsi32_si128(0xffff & qlp_coeff[4]); q4 = _mm_shuffle_epi32(q4, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q4, _mm_loadu_si128((const __m128i*)(data+i-5)));
						mull = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
		else {
			if(order > 2) {
				if(order == 4) {
					__m128i q0, q1, q2, q3;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));
					q3 = _mm_cvtsi32_si128(0xffff & qlp_coeff[3]); q3 = _mm_shuffle_epi32(q3, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q3, _mm_loadu_si128((const __m128i*)(data+i-4)));
						mull = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 3 */
					__m128i q0, q1, q2;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));
					q2 = _mm_cvtsi32_si128(0xffff & qlp_coeff[2]); q2 = _mm_shuffle_epi32(q2, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q2, _mm_loadu_si128((const __m128i*)(data+i-3)));
						mull = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2))); summ = _mm_add_epi32(summ, mull);
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
			else {
				if(order == 2) {
					__m128i q0, q1;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));
					q1 = _mm_cvtsi32_si128(0xffff & qlp_coeff[1]); q1 = _mm_shuffle_epi32(q1, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ, mull;
						summ = _mm_madd_epi16(q1, _mm_loadu_si128((const __m128i*)(data+i-2)));
						mull = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1))); summ = _mm_add_epi32(summ, mull);
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
				else { /* order == 1 */
					__m128i q0;
					q0 = _mm_cvtsi32_si128(0xffff & qlp_coeff[0]); q0 = _mm_shuffle_epi32(q0, _MM_SHUFFLE(0,0,0,0));

					for(i = 0; i < (int)data_len-3; i+=4) {
						__m128i summ;
						summ = _mm_madd_epi16(q0, _mm_loadu_si128((const __m128i*)(data+i-1)));
						summ = _mm_sra_epi32(summ, cnt);
						_mm_storeu_si128((__m128i*)(residual+i), _mm_sub_epi32(_mm_loadu_si128((const __m128i*)(data+i)), summ));
					}
				}
			}
		}
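		/* Handle the leftover samples (the vector loops above stop at a
		 * multiple of 4) with the plain scalar filter; the switch cases fall
		 * through on purpose so that exactly `order` taps are accumulated. */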
		for(; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 12: sum += qlp_coeff[11] * data[i-12];
				case 11: sum += qlp_coeff[10] * data[i-11];
				case 10: sum += qlp_coeff[ 9] * data[i-10];
				case 9: sum += qlp_coeff[ 8] * data[i- 9];
				case 8: sum += qlp_coeff[ 7] * data[i- 8];
				case 7: sum += qlp_coeff[ 6] * data[i- 7];
				case 6: sum += qlp_coeff[ 5] * data[i- 6];
				case 5: sum += qlp_coeff[ 4] * data[i- 5];
				case 4: sum += qlp_coeff[ 3] * data[i- 4];
				case 3: sum += qlp_coeff[ 2] * data[i- 3];
				case 2: sum += qlp_coeff[ 1] * data[i- 2];
				case 1: sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
	else { /* order > 12 */
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
	int i;

	FLAC__ASSERT(order > 0);
	FLAC__ASSERT(order <= 32);

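	/* Unlike the _16 variant above, this routine handles full-width samples
	 * and coefficients: products are formed two at a time with _mm_mul_epu32
	 * and only their low 32 bits are kept, which is all the 32-bit accumulator
	 * of the scalar code keeps anyway. */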
	if(order <= 12) {
		if(order > 8) { /* order == 9, 10, 11, 12 */
			if(order > 10) { /* order == 11, 12 */
				if(order == 12) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));  // 0 0 q[1] q[0]
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));  // 0 0 q[3] q[2]
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));  // 0 0 q[5] q[4]
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));  // 0 0 q[7] q[6]
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));  // 0 0 q[9] q[8]
					xmm5 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+10)); // 0 0 q[11] q[10]

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0)); // 0 q[1] 0 q[0]
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0)); // 0 q[3] 0 q[2]
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0)); // 0 q[5] 0 q[4]
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0)); // 0 q[7] 0 q[6]
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0)); // 0 q[9] 0 q[8]
					xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(3,1,2,0)); // 0 q[11] 0 q[10]
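					/* The coefficients now sit in the even 32-bit lanes, one
					 * pair per register (see the comments above), so
					 * _mm_mul_epu32 can multiply them against two samples at a
					 * time; the data loads below are shuffled the opposite way
					 * so that d[i-1] lines up with q[0], d[i-2] with q[1], and
					 * so on. */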

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[11] * data[i-12];
						//sum += qlp_coeff[10] * data[i-11];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-12)); // 0 0 d[i-11] d[i-12]
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1)); // 0 d[i-12] 0 d[i-11]
						xmm7 = _mm_mul_epu32(xmm7, xmm5); /* we use _unsigned_ multiplication and discard high dword of the result values */
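						/* Unsigned multiply is fine here: only the low 32 bits
						 * of each 64-bit product survive (see the final add and
						 * RESIDUAL32_RESULT), and those bits are identical for
						 * signed and unsigned multiplication. */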

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);
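						/* xmm7 now holds two partial sums in lanes 0 and 2;
						 * fold lane 2 into lane 0 and let the macro shift and
						 * subtract to produce residual[i]. */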

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 11 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));
					xmm5 = _mm_cvtsi32_si128(qlp_coeff[10]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[10] * data[i-11];
						xmm7 = _mm_cvtsi32_si128(data[i-11]);
						xmm7 = _mm_mul_epu32(xmm7, xmm5);

						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm4);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 9, 10 */
				if(order == 10) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+8));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));
					xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[9] * data[i-10];
						//sum += qlp_coeff[8] * data[i-9];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-10));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 9 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));
					xmm4 = _mm_cvtsi32_si128(qlp_coeff[8]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[8] * data[i-9];
						xmm7 = _mm_cvtsi32_si128(data[i-9]);
						xmm7 = _mm_mul_epu32(xmm7, xmm4);

						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm3);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else if(order > 4) { /* order == 5, 6, 7, 8 */
			if(order > 6) { /* order == 7, 8 */
				if(order == 8) {
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+6));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));
					xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[7] * data[i-8];
						//sum += qlp_coeff[6] * data[i-7];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-8));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 7 */
					__m128i xmm0, xmm1, xmm2, xmm3, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));
					xmm3 = _mm_cvtsi32_si128(qlp_coeff[6]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[6] * data[i-7];
						xmm7 = _mm_cvtsi32_si128(data[i-7]);
						xmm7 = _mm_mul_epu32(xmm7, xmm3);

						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm2);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 5, 6 */
				if(order == 6) {
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+4));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));
					xmm2 = _mm_shuffle_epi32(xmm2, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[5] * data[i-6];
						//sum += qlp_coeff[4] * data[i-5];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-6));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 5 */
					__m128i xmm0, xmm1, xmm2, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));
					xmm2 = _mm_cvtsi32_si128(qlp_coeff[4]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[4] * data[i-5];
						xmm7 = _mm_cvtsi32_si128(data[i-5]);
						xmm7 = _mm_mul_epu32(xmm7, xmm2);

						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm1);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
		}
		else { /* order == 1, 2, 3, 4 */
			if(order > 2) { /* order == 3, 4 */
				if(order == 4) {
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+2));

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));
					xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[3] * data[i-4];
						//sum += qlp_coeff[2] * data[i-3];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-4));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 3 */
					__m128i xmm0, xmm1, xmm6, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm1 = _mm_cvtsi32_si128(qlp_coeff[2]);

					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum = qlp_coeff[2] * data[i-3];
						xmm7 = _mm_cvtsi32_si128(data[i-3]);
						xmm7 = _mm_mul_epu32(xmm7, xmm1);

						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm6 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm6 = _mm_shuffle_epi32(xmm6, _MM_SHUFFLE(2,0,3,1));
						xmm6 = _mm_mul_epu32(xmm6, xmm0);
						xmm7 = _mm_add_epi32(xmm7, xmm6);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
			}
			else { /* order == 1, 2 */
				if(order == 2) {
					__m128i xmm0, xmm7;
					xmm0 = _mm_loadl_epi64((const __m128i*)(qlp_coeff+0));
					xmm0 = _mm_shuffle_epi32(xmm0, _MM_SHUFFLE(3,1,2,0));

					for(i = 0; i < (int)data_len; i++) {
						//sum = 0;
						//sum += qlp_coeff[1] * data[i-2];
						//sum += qlp_coeff[0] * data[i-1];
						xmm7 = _mm_loadl_epi64((const __m128i*)(data+i-2));
						xmm7 = _mm_shuffle_epi32(xmm7, _MM_SHUFFLE(2,0,3,1));
						xmm7 = _mm_mul_epu32(xmm7, xmm0);

						xmm7 = _mm_add_epi32(xmm7, _mm_srli_si128(xmm7, 8));
						RESIDUAL32_RESULT(xmm7);
					}
				}
				else { /* order == 1 */
					for(i = 0; i < (int)data_len; i++)
						residual[i] = data[i] - ((qlp_coeff[0] * data[i-1]) >> lp_quantization);
				}
			}
		}
	}
	else { /* order > 12 */
		FLAC__int32 sum;
		for(i = 0; i < (int)data_len; i++) {
			sum = 0;
			switch(order) {
				case 32: sum += qlp_coeff[31] * data[i-32];
				case 31: sum += qlp_coeff[30] * data[i-31];
				case 30: sum += qlp_coeff[29] * data[i-30];
				case 29: sum += qlp_coeff[28] * data[i-29];
				case 28: sum += qlp_coeff[27] * data[i-28];
				case 27: sum += qlp_coeff[26] * data[i-27];
				case 26: sum += qlp_coeff[25] * data[i-26];
				case 25: sum += qlp_coeff[24] * data[i-25];
				case 24: sum += qlp_coeff[23] * data[i-24];
				case 23: sum += qlp_coeff[22] * data[i-23];
				case 22: sum += qlp_coeff[21] * data[i-22];
				case 21: sum += qlp_coeff[20] * data[i-21];
				case 20: sum += qlp_coeff[19] * data[i-20];
				case 19: sum += qlp_coeff[18] * data[i-19];
				case 18: sum += qlp_coeff[17] * data[i-18];
				case 17: sum += qlp_coeff[16] * data[i-17];
				case 16: sum += qlp_coeff[15] * data[i-16];
				case 15: sum += qlp_coeff[14] * data[i-15];
				case 14: sum += qlp_coeff[13] * data[i-14];
				case 13: sum += qlp_coeff[12] * data[i-13];
				         sum += qlp_coeff[11] * data[i-12];
				         sum += qlp_coeff[10] * data[i-11];
				         sum += qlp_coeff[ 9] * data[i-10];
				         sum += qlp_coeff[ 8] * data[i- 9];
				         sum += qlp_coeff[ 7] * data[i- 8];
				         sum += qlp_coeff[ 6] * data[i- 7];
				         sum += qlp_coeff[ 5] * data[i- 6];
				         sum += qlp_coeff[ 4] * data[i- 5];
				         sum += qlp_coeff[ 3] * data[i- 4];
				         sum += qlp_coeff[ 2] * data[i- 3];
				         sum += qlp_coeff[ 1] * data[i- 2];
				         sum += qlp_coeff[ 0] * data[i- 1];
			}
			residual[i] = data[i] - (sum >> lp_quantization);
		}
	}
}

#if defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM /* unused for x64; not better than MMX asm */

FLAC__SSE_TARGET("sse2")
void FLAC__lpc_restore_signal_16_intrin_sse2(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[])
{
	if (order < 8 || order > 12) {
		FLAC__lpc_restore_signal(residual, data_len, qlp_coeff, order, lp_quantization, data);
		return;
	}
	if (data_len == 0)
		return;

	FLAC__ASSERT(order >= 8);
	FLAC__ASSERT(order <= 12);

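	/* Reference sketch (not compiled): restoring the signal is the inverse
	 * filter, equivalent to the scalar FLAC__lpc_restore_signal():
	 *
	 *   for(i = 0; i < (int)data_len; i++) {
	 *       sum = 0;
	 *       for(j = 0; j < (int)order; j++)
	 *           sum += qlp_coeff[j] * data[i-j-1];
	 *       data[i] = residual[i] + (sum >> lp_quantization);
	 *   }
	 *
	 * Each output feeds the next prediction, so the work is inherently serial;
	 * the code below keeps the most recent samples packed as 16-bit words in
	 * xmm registers and shifts each new sample in with _mm_insert_epi16. */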
	if(order > 8) { /* order == 9, 10, 11, 12 */
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm6 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+8)); /* read 0 to 3 uninitialized coeffs... */
		switch(order) /* ...and zero them out */
		{
			case 9:
				xmm1 = _mm_slli_si128(xmm1, 12); xmm1 = _mm_srli_si128(xmm1, 12); break;
			case 10:
				xmm1 = _mm_slli_si128(xmm1, 8); xmm1 = _mm_srli_si128(xmm1, 8); break;
			case 11:
				xmm1 = _mm_slli_si128(xmm1, 4); xmm1 = _mm_srli_si128(xmm1, 4); break;
		}
		xmm2 = _mm_setzero_si128();
		xmm0 = _mm_packs_epi32(xmm0, xmm6);
		xmm1 = _mm_packs_epi32(xmm1, xmm2);

		xmm4 = _mm_loadu_si128((const __m128i*)(data-12));
		xmm5 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm4 = _mm_shuffle_epi32(xmm4, _MM_SHUFFLE(0,1,2,3));
		xmm5 = _mm_shuffle_epi32(xmm5, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm4 = _mm_packs_epi32(xmm4, xmm2);
		xmm3 = _mm_packs_epi32(xmm3, xmm5);
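		/* The 12 (or fewer) most recent samples are now packed, newest first,
		 * as signed 16-bit words: xmm3 holds data[-1]..data[-8], xmm4 holds
		 * data[-9]..data[-12]; the coefficients were packed the same way into
		 * xmm0/xmm1 above, so _mm_madd_epi16 pairs them up directly. */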

		xmm7 = _mm_slli_si128(xmm1, 2);
		xmm7 = _mm_or_si128(xmm7, _mm_srli_si128(xmm0, 14));
		xmm2 = _mm_slli_si128(xmm0, 2);

		/* xmm0, xmm1: qlp_coeff
		   xmm2, xmm7: qlp_coeff << 16 bit
		   xmm3, xmm4: data */

		xmm5 = _mm_madd_epi16(xmm4, xmm1);
		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, xmm5);
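		/* Sum the four 32-bit pair-products down to one value in the low lane,
		 * then DATA16_RESULT applies the shift and adds the residual. */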
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		if(data_len % 2) {
			xmm6 = _mm_srli_si128(xmm3, 14);
			xmm4 = _mm_slli_si128(xmm4, 2);
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}

		while(data_len) { /* data_len is a multiple of 2 */
			/* 1 _mm_slli_si128 per data element less but we need shifted qlp_coeff in xmm2:xmm7 */
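			/* Two samples per pass: the history is shifted by two words at
			 * once, the previous output goes in at word 1 and is multiplied
			 * against the coefficient copy pre-shifted by one word
			 * (xmm2:xmm7); the fresh output is then inserted at word 0 and the
			 * unshifted coefficients (xmm0:xmm1) produce the second result. */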
			xmm6 = _mm_srli_si128(xmm3, 12);
			xmm4 = _mm_slli_si128(xmm4, 4);
			xmm3 = _mm_slli_si128(xmm3, 4);
			xmm4 = _mm_or_si128(xmm4, xmm6);
			xmm3 = _mm_insert_epi16(xmm3, curr, 1);

			xmm5 = _mm_madd_epi16(xmm4, xmm7);
			xmm6 = _mm_madd_epi16(xmm3, xmm2);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm5 = _mm_madd_epi16(xmm4, xmm1);
			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, xmm5);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len-=2;
		}
	} /* endif(order > 8) */
	else
	{
		FLAC__int32 curr;
		__m128i xmm0, xmm1, xmm3, xmm6;
		xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
		xmm1 = _mm_loadu_si128((const __m128i*)(qlp_coeff+4));
		xmm0 = _mm_packs_epi32(xmm0, xmm1);

		xmm1 = _mm_loadu_si128((const __m128i*)(data-8));
		xmm3 = _mm_loadu_si128((const __m128i*)(data-4));
		xmm1 = _mm_shuffle_epi32(xmm1, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_shuffle_epi32(xmm3, _MM_SHUFFLE(0,1,2,3));
		xmm3 = _mm_packs_epi32(xmm3, xmm1);
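		/* order == 8 here (smaller orders were handed off above), so the eight
		 * coefficients and the eight most recent samples each fit in a single
		 * register; one _mm_madd_epi16 per output sample is enough. */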

		/* xmm0: qlp_coeff
		   xmm3: data */

		xmm6 = _mm_madd_epi16(xmm3, xmm0);
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
		xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

		DATA16_RESULT(xmm6);

		data_len--;

		while(data_len) {
			xmm3 = _mm_slli_si128(xmm3, 2);
			xmm3 = _mm_insert_epi16(xmm3, curr, 0);

			xmm6 = _mm_madd_epi16(xmm3, xmm0);
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
			xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));

			DATA16_RESULT(xmm6);

			data_len--;
		}
	}
}

#endif /* defined FLAC__CPU_IA32 && !defined FLAC__HAS_NASM */

#endif /* FLAC__SSE2_SUPPORTED */
#endif /* (FLAC__CPU_IA32 || FLAC__CPU_X86_64) && FLAC__HAS_X86INTRIN */
#endif /* FLAC__NO_ASM */
#endif /* FLAC__INTEGER_ONLY_LIBRARY */