liba52 asm optimizations ported to amd64
[mplayer.git] / liba52 / resample_mmx.c
blob799b2e368359df7ba38f3bed7b70ee88de7c7e7e
2 // MMX optimizations from Michael Niedermayer (michaelni@gmx.at) (under GPL)
4 /* optimization TODO / NOTES
5 movntq is slightly faster (0.5% with the current test.c benchmark)
6 (but that's just test.c, so that needs to be tested in reality)
7 and it would mean (C / MMX2 / MMX / 3DNOW) versions
8 */
10 #include "a52_internal.h"
/* 64-bit constants loaded into MMX registers by the routines below via MANGLE(). */
/* 0x43c00000 in both 32-bit halves; 0x43c00000 is the IEEE-754 single-precision
   bit pattern of 384.0. The routines subtract it (psubd) from the raw 32-bit
   input words before packing them to 16 bit. */
13 static uint64_t attribute_used __attribute__((aligned(8))) magicF2W= 0x43c0000043c00000LL;
/* Mask that keeps the upper 16-bit word of each 32-bit half. */
14 static uint64_t attribute_used __attribute__((aligned(8))) wm1010= 0xFFFF0000FFFF0000LL;
/* Mask that keeps the lower 16-bit word of each 32-bit half. */
15 static uint64_t attribute_used __attribute__((aligned(8))) wm0101= 0x0000FFFF0000FFFFLL;
/* Mask that keeps the upper 32 bits and clears the lower 32 bits. */
16 static uint64_t attribute_used __attribute__((aligned(8))) wm1100= 0xFFFFFFFF00000000LL;
/*
 * Converts the first 256 32-bit words of _f into 256 frames of five int16
 * values in s16 and returns 5*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * In every output frame the first four slots are zero and the fifth slot
 * holds the converted sample. Four samples (a, b, c, d) are handled per loop
 * iteration; the lane comments list 16-bit words in memory order.
 */
18 static int a52_resample_MONO_to_5_MMX(float * _f, int16_t * s16){
19 int32_t * f = (int32_t *) _f;
20 asm volatile(
21 "mov $-512, %%"REG_S" \n\t" // index counts from -512 up to 0 in steps of 8
22 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
23 "movq "MANGLE(wm1100)", %%mm3 \n\t"
24 "movq "MANGLE(wm0101)", %%mm4 \n\t"
25 "movq "MANGLE(wm1010)", %%mm5 \n\t"
26 "pxor %%mm6, %%mm6 \n\t" // mm6 = 0, used for the silent slots
27 "1: \n\t"
28 "movq (%1, %%"REG_S", 2), %%mm0 \n\t" // index is scaled by 2 for the input
29 "movq 8(%1, %%"REG_S", 2), %%mm1\n\t"
30 "lea (%%"REG_S", %%"REG_S", 4), %%"REG_D"\n\t" // REG_D = 5*REG_S, byte offset into the output
31 "psubd %%mm7, %%mm0 \n\t"
32 "psubd %%mm7, %%mm1 \n\t"
33 "packssdw %%mm1, %%mm0 \n\t" // mm0 = a b c d
34 "movq %%mm0, %%mm1 \n\t"
35 "pand %%mm4, %%mm0 \n\t" // mm0 = a 0 c 0
36 "pand %%mm5, %%mm1 \n\t" // mm1 = 0 b 0 d
37 "movq %%mm6, (%0, %%"REG_D") \n\t" // 0 0 0 0
38 "movd %%mm0, 8(%0, %%"REG_D") \n\t" // A 0
39 "pand %%mm3, %%mm0 \n\t"
40 "movd %%mm6, 12(%0, %%"REG_D") \n\t" // 0 0
41 "movd %%mm1, 16(%0, %%"REG_D") \n\t" // 0 B
42 "pand %%mm3, %%mm1 \n\t"
43 "movd %%mm6, 20(%0, %%"REG_D") \n\t" // 0 0
44 "movq %%mm0, 24(%0, %%"REG_D") \n\t" // 0 0 C 0
45 "movq %%mm1, 32(%0, %%"REG_D") \n\t" // 0 0 0 D
46 "add $8, %%"REG_S" \n\t"
47 " jnz 1b \n\t"
48 "emms \n\t" // leave MMX state
49 :: "r" (s16+1280), "r" (f+256) // both operands point at the end of their buffers
50 :"%"REG_S, "%"REG_D, "memory"
52 return 5*256;
/*
 * Converts two consecutive runs of 256 32-bit words in _f (f[0..255] and
 * f[256..511]) into 256 interleaved two-channel int16 frames in s16 and
 * returns 2*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [run 0, run 1]. Four frames are produced per loop iteration.
 */
55 static int a52_resample_STEREO_to_2_MMX(float * _f, int16_t * s16){
56 int32_t * f = (int32_t *) _f;
57 /* benchmark scores are 0.3% better with SSE but we would need to set bias=0 and premultiply it
58 #ifdef HAVE_SSE
59 asm volatile(
60 "mov $-1024, %%"REG_S" \n\t"
61 "1: \n\t"
62 "cvtps2pi (%1, %%"REG_S"), %%mm0\n\t"
63 "cvtps2pi 1024(%1, %%"REG_S"), %%mm2\n\t"
64 "movq %%mm0, %%mm1 \n\t"
65 "punpcklwd %%mm2, %%mm0 \n\t"
66 "punpckhwd %%mm2, %%mm1 \n\t"
67 "movq %%mm0, (%0, %%"REG_S") \n\t"
68 "movq %%mm1, 8(%0, %%"REG_S") \n\t"
69 "add $16, %%"REG_S" \n\t"
70 " jnz 1b \n\t"
71 "emms \n\t"
72 :: "r" (s16+512), "r" (f+256)
73 :"%"REG_S, "memory"
74 );*/
// Active MMX implementation; the SSE variant above is disabled.
75 asm volatile(
76 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 16
77 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
78 "1: \n\t"
79 "movq (%1, %%"REG_S"), %%mm0 \n\t" // run 0, samples i, i+1
80 "movq 8(%1, %%"REG_S"), %%mm1 \n\t" // run 0, samples i+2, i+3
81 "movq 1024(%1, %%"REG_S"), %%mm2\n\t" // run 1, samples i, i+1
82 "movq 1032(%1, %%"REG_S"), %%mm3\n\t" // run 1, samples i+2, i+3
83 "psubd %%mm7, %%mm0 \n\t"
84 "psubd %%mm7, %%mm1 \n\t"
85 "psubd %%mm7, %%mm2 \n\t"
86 "psubd %%mm7, %%mm3 \n\t"
87 "packssdw %%mm1, %%mm0 \n\t" // mm0 = four int16 of run 0
88 "packssdw %%mm3, %%mm2 \n\t" // mm2 = four int16 of run 1
89 "movq %%mm0, %%mm1 \n\t"
90 "punpcklwd %%mm2, %%mm0 \n\t" // interleave frames i, i+1
91 "punpckhwd %%mm2, %%mm1 \n\t" // interleave frames i+2, i+3
92 "movq %%mm0, (%0, %%"REG_S") \n\t" // same byte index serves input and output
93 "movq %%mm1, 8(%0, %%"REG_S") \n\t"
94 "add $16, %%"REG_S" \n\t"
95 " jnz 1b \n\t"
96 "emms \n\t" // leave MMX state
97 :: "r" (s16+512), "r" (f+256) // both operands point at the end of their first 1024 bytes
98 :"%"REG_S, "memory"
100 return 2*256;
/*
 * Converts three consecutive runs of 256 32-bit words in _f (c0 = f[0..255],
 * c1 = f[256..511], c2 = f[512..767]) into 256 frames of five int16 values in
 * s16 and returns 5*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [c0, c2, 0, 0, c1]. Four frames (40 bytes) are produced per
 * loop iteration. The zero slots are produced by pairing a sample with a
 * zero or magicF2W dword so that the subtraction leaves 0 in that lane.
 * Register comments list 32-bit lanes as [low, high].
 */
103 static int a52_resample_3F_to_5_MMX(float * _f, int16_t * s16){
104 int32_t * f = (int32_t *) _f;
105 asm volatile(
106 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 16
107 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
108 "pxor %%mm6, %%mm6 \n\t" // mm6 = 0
109 "movq %%mm7, %%mm5 \n\t"
110 "punpckldq %%mm6, %%mm5 \n\t" // mm5 = [magic, 0]
111 "1: \n\t"
112 "movd (%1, %%"REG_S"), %%mm0 \n\t"
113 "punpckldq 2048(%1, %%"REG_S"), %%mm0\n\t" // mm0 = [c0[i], c2[i]]
114 "movd 1024(%1, %%"REG_S"), %%mm1\n\t"
115 "punpckldq 4(%1, %%"REG_S"), %%mm1\n\t" // mm1 = [c1[i], c0[i+1]]
116 "movd 2052(%1, %%"REG_S"), %%mm2\n\t" // mm2 = [c2[i+1], 0]
117 "movq %%mm7, %%mm3 \n\t"
118 "punpckldq 1028(%1, %%"REG_S"), %%mm3\n\t" // mm3 = [magic, c1[i+1]]
119 "movd 8(%1, %%"REG_S"), %%mm4 \n\t"
120 "punpckldq 2056(%1, %%"REG_S"), %%mm4\n\t" // mm4 = [c0[i+2], c2[i+2]]
121 "lea (%%"REG_S", %%"REG_S", 4), %%"REG_D"\n\t"
122 "sar $1, %%"REG_D" \n\t" // REG_D = 5*REG_S/2, byte offset into the output
123 "psubd %%mm7, %%mm0 \n\t"
124 "psubd %%mm7, %%mm1 \n\t"
125 "psubd %%mm5, %%mm2 \n\t" // high lane stays 0
126 "psubd %%mm7, %%mm3 \n\t" // low lane becomes 0
127 "psubd %%mm7, %%mm4 \n\t"
128 "packssdw %%mm6, %%mm0 \n\t" // c0[i] c2[i] 0 0
129 "packssdw %%mm2, %%mm1 \n\t" // c1[i] c0[i+1] c2[i+1] 0
130 "packssdw %%mm4, %%mm3 \n\t" // 0 c1[i+1] c0[i+2] c2[i+2]
131 "movq %%mm0, (%0, %%"REG_D") \n\t"
132 "movq %%mm1, 8(%0, %%"REG_D") \n\t"
133 "movq %%mm3, 16(%0, %%"REG_D") \n\t"
134 "movd 1032(%1, %%"REG_S"), %%mm1\n\t"
135 "punpckldq 12(%1, %%"REG_S"), %%mm1\n\t" // mm1 = [c1[i+2], c0[i+3]]
136 "movd 2060(%1, %%"REG_S"), %%mm2\n\t" // mm2 = [c2[i+3], 0]
137 "movq %%mm7, %%mm3 \n\t"
138 "punpckldq 1036(%1, %%"REG_S"), %%mm3\n\t" // mm3 = [magic, c1[i+3]]
139 "pxor %%mm0, %%mm0 \n\t"
140 "psubd %%mm7, %%mm1 \n\t"
141 "psubd %%mm5, %%mm2 \n\t"
142 "psubd %%mm7, %%mm3 \n\t"
143 "packssdw %%mm1, %%mm0 \n\t" // 0 0 c1[i+2] c0[i+3]
144 "packssdw %%mm3, %%mm2 \n\t" // c2[i+3] 0 0 c1[i+3]
145 "movq %%mm0, 24(%0, %%"REG_D") \n\t"
146 "movq %%mm2, 32(%0, %%"REG_D") \n\t"
148 "add $16, %%"REG_S" \n\t"
149 " jnz 1b \n\t"
150 "emms \n\t" // leave MMX state
151 :: "r" (s16+1280), "r" (f+256) // end of the output buffer, end of run c0
152 :"%"REG_S, "%"REG_D, "memory"
154 return 5*256;
/*
 * Converts four consecutive runs of 256 32-bit words in _f (c0..c3, starting
 * at f[0], f[256], f[512], f[768]) into 256 interleaved four-channel int16
 * frames in s16 and returns 4*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [c0, c1, c2, c3]. Four frames (32 bytes) are produced per
 * loop iteration by a 4x4 transpose of 16-bit words.
 */
157 static int a52_resample_2F_2R_to_4_MMX(float * _f, int16_t * s16){
158 int32_t * f = (int32_t *) _f;
159 asm volatile(
160 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 16
161 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
162 "1: \n\t"
163 "movq (%1, %%"REG_S"), %%mm0 \n\t"
164 "movq 8(%1, %%"REG_S"), %%mm1 \n\t"
165 "movq 1024(%1, %%"REG_S"), %%mm2\n\t"
166 "movq 1032(%1, %%"REG_S"), %%mm3\n\t"
167 "psubd %%mm7, %%mm0 \n\t"
168 "psubd %%mm7, %%mm1 \n\t"
169 "psubd %%mm7, %%mm2 \n\t"
170 "psubd %%mm7, %%mm3 \n\t"
171 "packssdw %%mm1, %%mm0 \n\t" // mm0 = four int16 of c0
172 "packssdw %%mm3, %%mm2 \n\t" // mm2 = four int16 of c1
173 "movq 2048(%1, %%"REG_S"), %%mm3\n\t"
174 "movq 2056(%1, %%"REG_S"), %%mm4\n\t"
175 "movq 3072(%1, %%"REG_S"), %%mm5\n\t"
176 "movq 3080(%1, %%"REG_S"), %%mm6\n\t"
177 "psubd %%mm7, %%mm3 \n\t"
178 "psubd %%mm7, %%mm4 \n\t"
179 "psubd %%mm7, %%mm5 \n\t"
180 "psubd %%mm7, %%mm6 \n\t"
181 "packssdw %%mm4, %%mm3 \n\t" // mm3 = four int16 of c2
182 "packssdw %%mm6, %%mm5 \n\t" // mm5 = four int16 of c3
183 "movq %%mm0, %%mm1 \n\t"
184 "movq %%mm3, %%mm4 \n\t"
185 "punpcklwd %%mm2, %%mm0 \n\t" // c0/c1 pairs of frames i, i+1
186 "punpckhwd %%mm2, %%mm1 \n\t" // c0/c1 pairs of frames i+2, i+3
187 "punpcklwd %%mm5, %%mm3 \n\t" // c2/c3 pairs of frames i, i+1
188 "punpckhwd %%mm5, %%mm4 \n\t" // c2/c3 pairs of frames i+2, i+3
189 "movq %%mm0, %%mm2 \n\t"
190 "movq %%mm1, %%mm5 \n\t"
191 "punpckldq %%mm3, %%mm0 \n\t" // frame i
192 "punpckhdq %%mm3, %%mm2 \n\t" // frame i+1
193 "punpckldq %%mm4, %%mm1 \n\t" // frame i+2
194 "punpckhdq %%mm4, %%mm5 \n\t" // frame i+3
195 "movq %%mm0, (%0, %%"REG_S",2) \n\t" // index is scaled by 2 for the output
196 "movq %%mm2, 8(%0, %%"REG_S",2) \n\t"
197 "movq %%mm1, 16(%0, %%"REG_S",2)\n\t"
198 "movq %%mm5, 24(%0, %%"REG_S",2)\n\t"
199 "add $16, %%"REG_S" \n\t"
200 " jnz 1b \n\t"
201 "emms \n\t" // leave MMX state
202 :: "r" (s16+1024), "r" (f+256) // end of the output buffer, end of run c0
203 :"%"REG_S, "memory"
205 return 4*256;
/*
 * Converts five consecutive runs of 256 32-bit words in _f (c0..c4, starting
 * at f[0], f[256], f[512], f[768], f[1024]) into 256 frames of five int16
 * values in s16 and returns 5*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [c0, c2, c3, c4, c1]. Four frames (40 bytes) are produced per
 * loop iteration; samples are gathered directly in output order.
 * Register comments list 32-bit lanes as [low, high].
 */
208 static int a52_resample_3F_2R_to_5_MMX(float * _f, int16_t * s16){
209 int32_t * f = (int32_t *) _f;
210 asm volatile(
211 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 16
212 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
213 "1: \n\t"
214 "movd (%1, %%"REG_S"), %%mm0 \n\t"
215 "punpckldq 2048(%1, %%"REG_S"), %%mm0\n\t" // mm0 = [c0[i], c2[i]]
216 "movd 3072(%1, %%"REG_S"), %%mm1\n\t"
217 "punpckldq 4096(%1, %%"REG_S"), %%mm1\n\t" // mm1 = [c3[i], c4[i]]
218 "movd 1024(%1, %%"REG_S"), %%mm2\n\t"
219 "punpckldq 4(%1, %%"REG_S"), %%mm2\n\t" // mm2 = [c1[i], c0[i+1]]
220 "movd 2052(%1, %%"REG_S"), %%mm3\n\t"
221 "punpckldq 3076(%1, %%"REG_S"), %%mm3\n\t" // mm3 = [c2[i+1], c3[i+1]]
222 "movd 4100(%1, %%"REG_S"), %%mm4\n\t"
223 "punpckldq 1028(%1, %%"REG_S"), %%mm4\n\t" // mm4 = [c4[i+1], c1[i+1]]
224 "movd 8(%1, %%"REG_S"), %%mm5 \n\t"
225 "punpckldq 2056(%1, %%"REG_S"), %%mm5\n\t" // mm5 = [c0[i+2], c2[i+2]]
226 "lea (%%"REG_S", %%"REG_S", 4), %%"REG_D"\n\t"
227 "sar $1, %%"REG_D" \n\t" // REG_D = 5*REG_S/2, byte offset into the output
228 "psubd %%mm7, %%mm0 \n\t"
229 "psubd %%mm7, %%mm1 \n\t"
230 "psubd %%mm7, %%mm2 \n\t"
231 "psubd %%mm7, %%mm3 \n\t"
232 "psubd %%mm7, %%mm4 \n\t"
233 "psubd %%mm7, %%mm5 \n\t"
234 "packssdw %%mm1, %%mm0 \n\t"
235 "packssdw %%mm3, %%mm2 \n\t"
236 "packssdw %%mm5, %%mm4 \n\t"
237 "movq %%mm0, (%0, %%"REG_D") \n\t"
238 "movq %%mm2, 8(%0, %%"REG_D") \n\t"
239 "movq %%mm4, 16(%0, %%"REG_D") \n\t"
// Remaining eight int16 values of this group of four frames.
241 "movd 3080(%1, %%"REG_S"), %%mm0\n\t"
242 "punpckldq 4104(%1, %%"REG_S"), %%mm0\n\t" // mm0 = [c3[i+2], c4[i+2]]
243 "movd 1032(%1, %%"REG_S"), %%mm1\n\t"
244 "punpckldq 12(%1, %%"REG_S"), %%mm1\n\t" // mm1 = [c1[i+2], c0[i+3]]
245 "movd 2060(%1, %%"REG_S"), %%mm2\n\t"
246 "punpckldq 3084(%1, %%"REG_S"), %%mm2\n\t" // mm2 = [c2[i+3], c3[i+3]]
247 "movd 4108(%1, %%"REG_S"), %%mm3\n\t"
248 "punpckldq 1036(%1, %%"REG_S"), %%mm3\n\t" // mm3 = [c4[i+3], c1[i+3]]
249 "psubd %%mm7, %%mm0 \n\t"
250 "psubd %%mm7, %%mm1 \n\t"
251 "psubd %%mm7, %%mm2 \n\t"
252 "psubd %%mm7, %%mm3 \n\t"
253 "packssdw %%mm1, %%mm0 \n\t"
254 "packssdw %%mm3, %%mm2 \n\t"
255 "movq %%mm0, 24(%0, %%"REG_D") \n\t"
256 "movq %%mm2, 32(%0, %%"REG_D") \n\t"
258 "add $16, %%"REG_S" \n\t"
259 " jnz 1b \n\t"
260 "emms \n\t" // leave MMX state
261 :: "r" (s16+1280), "r" (f+256) // end of the output buffer, end of run c0
262 :"%"REG_S, "%"REG_D, "memory"
264 return 5*256;
/*
 * Converts two consecutive runs of 256 32-bit words in _f (c0 = f[0..255],
 * c1 = f[256..511]) into 256 frames of six int16 values in s16 and returns
 * 6*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [0, 0, 0, 0, c1, c0]. Four frames (48 bytes) are produced per
 * loop iteration.
 */
267 static int a52_resample_MONO_LFE_to_6_MMX(float * _f, int16_t * s16){
268 int32_t * f = (int32_t *) _f;
269 asm volatile(
270 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 16
271 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
272 "pxor %%mm6, %%mm6 \n\t" // mm6 = 0, used for the silent slots
273 "1: \n\t"
274 "movq 1024(%1, %%"REG_S"), %%mm0\n\t" // c1, samples i, i+1
275 "movq 1032(%1, %%"REG_S"), %%mm1\n\t" // c1, samples i+2, i+3
276 "movq (%1, %%"REG_S"), %%mm2 \n\t" // c0, samples i, i+1
277 "movq 8(%1, %%"REG_S"), %%mm3 \n\t" // c0, samples i+2, i+3
278 "psubd %%mm7, %%mm0 \n\t"
279 "psubd %%mm7, %%mm1 \n\t"
280 "psubd %%mm7, %%mm2 \n\t"
281 "psubd %%mm7, %%mm3 \n\t"
282 "packssdw %%mm1, %%mm0 \n\t"
283 "packssdw %%mm3, %%mm2 \n\t"
284 "movq %%mm0, %%mm1 \n\t"
285 "punpcklwd %%mm2, %%mm0 \n\t" // (c1, c0) pairs of frames i, i+1
286 "punpckhwd %%mm2, %%mm1 \n\t" // (c1, c0) pairs of frames i+2, i+3
287 "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" // REG_D = 3*REG_S, byte offset into the output
288 "movq %%mm6, (%0, %%"REG_D") \n\t" // frame i: four zero slots
289 "movd %%mm0, 8(%0, %%"REG_D") \n\t" // frame i: c1, c0
290 "punpckhdq %%mm0, %%mm0 \n\t"
291 "movq %%mm6, 12(%0, %%"REG_D") \n\t" // frame i+1: four zero slots
292 "movd %%mm0, 20(%0, %%"REG_D") \n\t" // frame i+1: c1, c0
293 "movq %%mm6, 24(%0, %%"REG_D") \n\t" // frame i+2: four zero slots
294 "movd %%mm1, 32(%0, %%"REG_D") \n\t" // frame i+2: c1, c0
295 "punpckhdq %%mm1, %%mm1 \n\t"
296 "movq %%mm6, 36(%0, %%"REG_D") \n\t" // frame i+3: four zero slots
297 "movd %%mm1, 44(%0, %%"REG_D") \n\t" // frame i+3: c1, c0
298 "add $16, %%"REG_S" \n\t"
299 " jnz 1b \n\t"
300 "emms \n\t" // leave MMX state
301 :: "r" (s16+1536), "r" (f+256) // end of the output buffer, end of run c0
302 :"%"REG_S, "%"REG_D, "memory"
304 return 6*256;
/*
 * Converts three consecutive runs of 256 32-bit words in _f (c0 = f[0..255],
 * c1 = f[256..511], c2 = f[512..767]) into 256 frames of six int16 values in
 * s16 and returns 6*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [c1, c2, 0, 0, 0, c0]. Two frames (24 bytes) are produced per
 * loop iteration.
 * Lane comments list 16-bit words from most to least significant; A = c1,
 * B = c2, F = c0; lowercase is the first frame of the pair, uppercase the
 * second; X marks a don't-care lane.
 */
307 static int a52_resample_STEREO_LFE_to_6_MMX(float * _f, int16_t * s16){
308 int32_t * f = (int32_t *) _f;
309 asm volatile(
310 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 8
311 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
312 "pxor %%mm6, %%mm6 \n\t" // mm6 = 0
313 "1: \n\t"
314 "movq 1024(%1, %%"REG_S"), %%mm0\n\t" // c1
315 "movq 2048(%1, %%"REG_S"), %%mm1\n\t" // c2
316 "movq (%1, %%"REG_S"), %%mm5 \n\t" // c0
317 "psubd %%mm7, %%mm0 \n\t"
318 "psubd %%mm7, %%mm1 \n\t"
319 "psubd %%mm7, %%mm5 \n\t"
320 "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" // REG_D = 3*REG_S, byte offset into the output
322 "pxor %%mm4, %%mm4 \n\t"
323 "packssdw %%mm5, %%mm0 \n\t" // FfAa
324 "packssdw %%mm4, %%mm1 \n\t" // 00Bb
325 "punpckhwd %%mm0, %%mm4 \n\t" // F0f0
326 "punpcklwd %%mm1, %%mm0 \n\t" // BAba
327 "movq %%mm0, %%mm1 \n\t" // BAba
// mm3 is not initialised here; its stale low half is discarded by the punpckhdq below.
328 "punpckldq %%mm4, %%mm3 \n\t" // f0XX
329 "punpckldq %%mm6, %%mm0 \n\t" // 00ba
330 "punpckhdq %%mm1, %%mm3 \n\t" // BAf0
332 "movq %%mm0, (%0, %%"REG_D") \n\t" // 00ba
333 "punpckhdq %%mm4, %%mm0 \n\t" // F000
334 "movq %%mm3, 8(%0, %%"REG_D") \n\t" // BAf0
335 "movq %%mm0, 16(%0, %%"REG_D") \n\t" // F000
336 "add $8, %%"REG_S" \n\t"
337 " jnz 1b \n\t"
338 "emms \n\t" // leave MMX state
339 :: "r" (s16+1536), "r" (f+256) // end of the output buffer, end of run c0
340 :"%"REG_S, "%"REG_D, "memory"
342 return 6*256;
/*
 * Converts four consecutive runs of 256 32-bit words in _f (c0..c3, starting
 * at f[0], f[256], f[512], f[768]) into 256 frames of six int16 values in s16
 * and returns 6*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [c1, c3, 0, 0, c2, c0]. Two frames (24 bytes) are produced
 * per loop iteration.
 * Lane comments list 16-bit words from most to least significant; A = c1,
 * B = c3, E = c2, F = c0; lowercase is the first frame of the pair,
 * uppercase the second.
 */
345 static int a52_resample_3F_LFE_to_6_MMX(float * _f, int16_t * s16){
346 int32_t * f = (int32_t *) _f;
347 asm volatile(
348 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 8
349 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
350 "pxor %%mm6, %%mm6 \n\t" // mm6 = 0
351 "1: \n\t"
352 "movq 1024(%1, %%"REG_S"), %%mm0\n\t" // c1
353 "movq 3072(%1, %%"REG_S"), %%mm1\n\t" // c3
354 "movq 2048(%1, %%"REG_S"), %%mm4\n\t" // c2
355 "movq (%1, %%"REG_S"), %%mm5 \n\t" // c0
356 "psubd %%mm7, %%mm0 \n\t"
357 "psubd %%mm7, %%mm1 \n\t"
358 "psubd %%mm7, %%mm4 \n\t"
359 "psubd %%mm7, %%mm5 \n\t"
360 "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" // REG_D = 3*REG_S, byte offset into the output
362 "packssdw %%mm4, %%mm0 \n\t" // EeAa
363 "packssdw %%mm5, %%mm1 \n\t" // FfBb
364 "movq %%mm0, %%mm2 \n\t" // EeAa
365 "punpcklwd %%mm1, %%mm0 \n\t" // BAba
366 "punpckhwd %%mm1, %%mm2 \n\t" // FEfe
367 "movq %%mm0, %%mm1 \n\t" // BAba
368 "punpckldq %%mm6, %%mm0 \n\t" // 00ba
369 "punpckhdq %%mm1, %%mm1 \n\t" // BABA
371 "movq %%mm0, (%0, %%"REG_D") \n\t" // 00ba
372 "punpckhdq %%mm2, %%mm0 \n\t" // FE00
373 "punpckldq %%mm1, %%mm2 \n\t" // BAfe
374 "movq %%mm2, 8(%0, %%"REG_D") \n\t" // BAfe
375 "movq %%mm0, 16(%0, %%"REG_D") \n\t" // FE00
376 "add $8, %%"REG_S" \n\t"
377 " jnz 1b \n\t"
378 "emms \n\t" // leave MMX state
379 :: "r" (s16+1536), "r" (f+256) // end of the output buffer, end of run c0
380 :"%"REG_S, "%"REG_D, "memory"
382 return 6*256;
/*
 * Converts five consecutive runs of 256 32-bit words in _f (c0..c4, starting
 * at f[0], f[256], f[512], f[768], f[1024]) into 256 frames of six int16
 * values in s16 and returns 6*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [c1, c2, c3, c4, 0, c0]. Two frames (24 bytes) are produced
 * per loop iteration.
 * Lane comments list 16-bit words from most to least significant; A = c1,
 * B = c2, C = c3, D = c4, F = c0; lowercase is the first frame of the pair,
 * uppercase the second.
 */
385 static int a52_resample_2F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
386 int32_t * f = (int32_t *) _f;
387 asm volatile(
388 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 8
389 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
390 // "pxor %%mm6, %%mm6 \n\t"
391 "1: \n\t"
392 "movq 1024(%1, %%"REG_S"), %%mm0\n\t" // c1
393 "movq 2048(%1, %%"REG_S"), %%mm1\n\t" // c2
394 "movq 3072(%1, %%"REG_S"), %%mm2\n\t" // c3
395 "movq 4096(%1, %%"REG_S"), %%mm3\n\t" // c4
396 "movq (%1, %%"REG_S"), %%mm5 \n\t" // c0
397 "psubd %%mm7, %%mm0 \n\t"
398 "psubd %%mm7, %%mm1 \n\t"
399 "psubd %%mm7, %%mm2 \n\t"
400 "psubd %%mm7, %%mm3 \n\t"
401 "psubd %%mm7, %%mm5 \n\t"
402 "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" // REG_D = 3*REG_S, byte offset into the output
404 "packssdw %%mm2, %%mm0 \n\t" // CcAa
405 "packssdw %%mm3, %%mm1 \n\t" // DdBb
406 "packssdw %%mm5, %%mm5 \n\t" // FfFf
407 "movq %%mm0, %%mm2 \n\t" // CcAa
408 "punpcklwd %%mm1, %%mm0 \n\t" // BAba
409 "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
410 "pxor %%mm4, %%mm4 \n\t" // 0000
411 "punpcklwd %%mm5, %%mm4 \n\t" // F0f0
412 "movq %%mm0, %%mm1 \n\t" // BAba
413 "movq %%mm4, %%mm3 \n\t" // F0f0
414 "punpckldq %%mm2, %%mm0 \n\t" // dcba
415 "punpckhdq %%mm1, %%mm1 \n\t" // BABA
416 "punpckldq %%mm1, %%mm4 \n\t" // BAf0
417 "punpckhdq %%mm3, %%mm2 \n\t" // F0DC
419 "movq %%mm0, (%0, %%"REG_D") \n\t" // dcba
420 "movq %%mm4, 8(%0, %%"REG_D") \n\t" // BAf0
421 "movq %%mm2, 16(%0, %%"REG_D") \n\t" // F0DC
422 "add $8, %%"REG_S" \n\t"
423 " jnz 1b \n\t"
424 "emms \n\t" // leave MMX state
425 :: "r" (s16+1536), "r" (f+256) // end of the output buffer, end of run c0
426 :"%"REG_S, "%"REG_D, "memory"
428 return 6*256;
/*
 * Converts six consecutive runs of 256 32-bit words in _f (c0..c5, starting
 * at f[0], f[256], f[512], f[768], f[1024], f[1280]) into 256 frames of six
 * int16 values in s16 and returns 6*256, the number of int16 values written.
 *
 * Each input word is handled as a 32-bit integer: magicF2W is subtracted and
 * the result is packed to int16 with signed saturation (packssdw).
 * NOTE(review): this presumably relies on the caller having biased the float
 * samples so that their low bits hold the integer sample - confirm.
 *
 * Frame layout: [c1, c3, c4, c5, c2, c0]. Two frames (24 bytes) are produced
 * per loop iteration.
 * Lane comments list 16-bit words from most to least significant; A = c1,
 * B = c3, C = c4, D = c5, E = c2, F = c0; lowercase is the first frame of
 * the pair, uppercase the second.
 */
431 static int a52_resample_3F_2R_LFE_to_6_MMX(float * _f, int16_t * s16){
432 int32_t * f = (int32_t *) _f;
433 asm volatile(
434 "mov $-1024, %%"REG_S" \n\t" // byte index, counts from -1024 up to 0 in steps of 8
435 "movq "MANGLE(magicF2W)", %%mm7 \n\t"
436 // "pxor %%mm6, %%mm6 \n\t"
437 "1: \n\t"
438 "movq 1024(%1, %%"REG_S"), %%mm0\n\t" // c1
439 "movq 3072(%1, %%"REG_S"), %%mm1\n\t" // c3
440 "movq 4096(%1, %%"REG_S"), %%mm2\n\t" // c4
441 "movq 5120(%1, %%"REG_S"), %%mm3\n\t" // c5
442 "movq 2048(%1, %%"REG_S"), %%mm4\n\t" // c2
443 "movq (%1, %%"REG_S"), %%mm5 \n\t" // c0
444 "psubd %%mm7, %%mm0 \n\t"
445 "psubd %%mm7, %%mm1 \n\t"
446 "psubd %%mm7, %%mm2 \n\t"
447 "psubd %%mm7, %%mm3 \n\t"
448 "psubd %%mm7, %%mm4 \n\t"
449 "psubd %%mm7, %%mm5 \n\t"
450 "lea (%%"REG_S", %%"REG_S", 2), %%"REG_D"\n\t" // REG_D = 3*REG_S, byte offset into the output
452 "packssdw %%mm2, %%mm0 \n\t" // CcAa
453 "packssdw %%mm3, %%mm1 \n\t" // DdBb
454 "packssdw %%mm4, %%mm4 \n\t" // EeEe
455 "packssdw %%mm5, %%mm5 \n\t" // FfFf
456 "movq %%mm0, %%mm2 \n\t" // CcAa
457 "punpcklwd %%mm1, %%mm0 \n\t" // BAba
458 "punpckhwd %%mm1, %%mm2 \n\t" // DCdc
459 "punpcklwd %%mm5, %%mm4 \n\t" // FEfe
460 "movq %%mm0, %%mm1 \n\t" // BAba
461 "movq %%mm4, %%mm3 \n\t" // FEfe
462 "punpckldq %%mm2, %%mm0 \n\t" // dcba
463 "punpckhdq %%mm1, %%mm1 \n\t" // BABA
464 "punpckldq %%mm1, %%mm4 \n\t" // BAfe
465 "punpckhdq %%mm3, %%mm2 \n\t" // FEDC
467 "movq %%mm0, (%0, %%"REG_D") \n\t" // dcba
468 "movq %%mm4, 8(%0, %%"REG_D") \n\t" // BAfe
469 "movq %%mm2, 16(%0, %%"REG_D") \n\t" // FEDC
470 "add $8, %%"REG_S" \n\t"
471 " jnz 1b \n\t"
472 "emms \n\t" // leave MMX state
473 :: "r" (s16+1536), "r" (f+256) // end of the output buffer, end of run c0
474 :"%"REG_S, "%"REG_D, "memory"
476 return 6*256;
/*
 * Selects the MMX resample routine for a channel configuration.
 *
 * flags is compared against the A52_* configuration values, optionally OR-ed
 * with A52_LFE; ch is the number of output channels and must equal the one
 * output count supported for that configuration.
 * Returns the matching routine as a void pointer, or NULL when flags matches
 * no case or ch is not the supported output count.
 */
480 static void* a52_resample_MMX(int flags, int ch){
481 switch (flags) {
482 case A52_MONO:
483 if(ch==5) return a52_resample_MONO_to_5_MMX;
484 break;
// All three two-channel configurations share the stereo routine.
485 case A52_CHANNEL:
486 case A52_STEREO:
487 case A52_DOLBY:
488 if(ch==2) return a52_resample_STEREO_to_2_MMX;
489 break;
490 case A52_3F:
491 if(ch==5) return a52_resample_3F_to_5_MMX;
492 break;
493 case A52_2F2R:
494 if(ch==4) return a52_resample_2F_2R_to_4_MMX;
495 break;
496 case A52_3F2R:
497 if(ch==5) return a52_resample_3F_2R_to_5_MMX;
498 break;
// Configurations with A52_LFE set always produce six output channels.
499 case A52_MONO | A52_LFE:
500 if(ch==6) return a52_resample_MONO_LFE_to_6_MMX;
501 break;
502 case A52_CHANNEL | A52_LFE:
503 case A52_STEREO | A52_LFE:
504 case A52_DOLBY | A52_LFE:
505 if(ch==6) return a52_resample_STEREO_LFE_to_6_MMX;
506 break;
507 case A52_3F | A52_LFE:
508 if(ch==6) return a52_resample_3F_LFE_to_6_MMX;
509 break;
510 case A52_2F2R | A52_LFE:
511 if(ch==6) return a52_resample_2F_2R_LFE_to_6_MMX;
512 break;
513 case A52_3F2R | A52_LFE:
514 if(ch==6) return a52_resample_3F_2R_LFE_to_6_MMX;
515 break;
// No routine for this combination.
517 return NULL;