Add a comment that explains why this header has no multiple inclusion guards.
[mplayer/greg.git] / mp3lib / dct64_k7.c
blobf668f8b27d5e82c223b4776bd56361a2ea8d366d
1 /*
2 * This code was taken from http://www.mpg123.org
3 * See ChangeLog of mpg123-0.59s-pre.1 for details
4 * Applied to mplayer by Nick Kurshev <nickols_k@mail.ru>
5 * Partial 3dnowex-DSP! optimization by Nick Kurshev
7 * TODO: optimize scalar 3dnow! code
8 * Warning: Phases 7 & 8 are not tested
9 */
10 #define real float /* ugly - but only way */
12 #include "config.h"
13 #include "mangle.h"
/*
 * Constants read by name from the inline assembly below (via MANGLE()).
 * NOTE(review): attribute_used is defined elsewhere; presumably it keeps the
 * compiler from discarding these symbols, which no C code references - confirm.
 */
/*
 * 64-bit mask with only bit 63 set. The assembly loads it into mm7 with movq
 * (hence the 8-byte alignment) and XORs it into a copy of a packed float pair,
 * which flips the sign bit of the upper float and leaves the lower one
 * unchanged (the "+1 | -1" pattern noted at the load site).
 */
15 static unsigned long long int attribute_used __attribute__((aligned(8))) x_plus_minus_3dnow = 0x8000000000000000ULL;
/*
 * The value 1.0, loaded with movd into the low half of mm6; the high half is
 * then filled from offset 120 of costab_mmx, giving a packed multiplier of
 * { 1.0, costab_mmx[120 bytes] }.
 */
16 static float attribute_used plus_1f = 1.0;
18 void dct64_MMX_3dnowex(short *a,short *b,real *c)
20 char tmp[256];
21 __asm __volatile(
22 " movl %2,%%eax\n\t"
24 " leal 128+%3,%%edx\n\t"
25 " movl %0,%%esi\n\t"
26 " movl %1,%%edi\n\t"
27 " movl $"MANGLE(costab_mmx)",%%ebx\n\t"
28 " leal %3,%%ecx\n\t"
30 /* Phase 1*/
31 " movq (%%eax), %%mm0\n\t"
32 " movq 8(%%eax), %%mm4\n\t"
33 " movq %%mm0, %%mm3\n\t"
34 " movq %%mm4, %%mm7\n\t"
35 " pswapd 120(%%eax), %%mm1\n\t"
36 " pswapd 112(%%eax), %%mm5\n\t"
37 " pfadd %%mm1, %%mm0\n\t"
38 " pfadd %%mm5, %%mm4\n\t"
39 " movq %%mm0, (%%edx)\n\t"
40 " movq %%mm4, 8(%%edx)\n\t"
41 " pfsub %%mm1, %%mm3\n\t"
42 " pfsub %%mm5, %%mm7\n\t"
43 " pfmul (%%ebx), %%mm3\n\t"
44 " pfmul 8(%%ebx), %%mm7\n\t"
45 " pswapd %%mm3, %%mm3\n\t"
46 " pswapd %%mm7, %%mm7\n\t"
47 " movq %%mm3, 120(%%edx)\n\t"
48 " movq %%mm7, 112(%%edx)\n\t"
50 " movq 16(%%eax), %%mm0\n\t"
51 " movq 24(%%eax), %%mm4\n\t"
52 " movq %%mm0, %%mm3\n\t"
53 " movq %%mm4, %%mm7\n\t"
54 " pswapd 104(%%eax), %%mm1\n\t"
55 " pswapd 96(%%eax), %%mm5\n\t"
56 " pfadd %%mm1, %%mm0\n\t"
57 " pfadd %%mm5, %%mm4\n\t"
58 " movq %%mm0, 16(%%edx)\n\t"
59 " movq %%mm4, 24(%%edx)\n\t"
60 " pfsub %%mm1, %%mm3\n\t"
61 " pfsub %%mm5, %%mm7\n\t"
62 " pfmul 16(%%ebx), %%mm3\n\t"
63 " pfmul 24(%%ebx), %%mm7\n\t"
64 " pswapd %%mm3, %%mm3\n\t"
65 " pswapd %%mm7, %%mm7\n\t"
66 " movq %%mm3, 104(%%edx)\n\t"
67 " movq %%mm7, 96(%%edx)\n\t"
69 " movq 32(%%eax), %%mm0\n\t"
70 " movq 40(%%eax), %%mm4\n\t"
71 " movq %%mm0, %%mm3\n\t"
72 " movq %%mm4, %%mm7\n\t"
73 " pswapd 88(%%eax), %%mm1\n\t"
74 " pswapd 80(%%eax), %%mm5\n\t"
75 " pfadd %%mm1, %%mm0\n\t"
76 " pfadd %%mm5, %%mm4\n\t"
77 " movq %%mm0, 32(%%edx)\n\t"
78 " movq %%mm4, 40(%%edx)\n\t"
79 " pfsub %%mm1, %%mm3\n\t"
80 " pfsub %%mm5, %%mm7\n\t"
81 " pfmul 32(%%ebx), %%mm3\n\t"
82 " pfmul 40(%%ebx), %%mm7\n\t"
83 " pswapd %%mm3, %%mm3\n\t"
84 " pswapd %%mm7, %%mm7\n\t"
85 " movq %%mm3, 88(%%edx)\n\t"
86 " movq %%mm7, 80(%%edx)\n\t"
88 " movq 48(%%eax), %%mm0\n\t"
89 " movq 56(%%eax), %%mm4\n\t"
90 " movq %%mm0, %%mm3\n\t"
91 " movq %%mm4, %%mm7\n\t"
92 " pswapd 72(%%eax), %%mm1\n\t"
93 " pswapd 64(%%eax), %%mm5\n\t"
94 " pfadd %%mm1, %%mm0\n\t"
95 " pfadd %%mm5, %%mm4\n\t"
96 " movq %%mm0, 48(%%edx)\n\t"
97 " movq %%mm4, 56(%%edx)\n\t"
98 " pfsub %%mm1, %%mm3\n\t"
99 " pfsub %%mm5, %%mm7\n\t"
100 " pfmul 48(%%ebx), %%mm3\n\t"
101 " pfmul 56(%%ebx), %%mm7\n\t"
102 " pswapd %%mm3, %%mm3\n\t"
103 " pswapd %%mm7, %%mm7\n\t"
104 " movq %%mm3, 72(%%edx)\n\t"
105 " movq %%mm7, 64(%%edx)\n\t"
107 /* Phase 2*/
109 " movq (%%edx), %%mm0\n\t"
110 " movq 8(%%edx), %%mm4\n\t"
111 " movq %%mm0, %%mm3\n\t"
112 " movq %%mm4, %%mm7\n\t"
113 " pswapd 56(%%edx), %%mm1\n\t"
114 " pswapd 48(%%edx), %%mm5\n\t"
115 " pfadd %%mm1, %%mm0\n\t"
116 " pfadd %%mm5, %%mm4\n\t"
117 " movq %%mm0, (%%ecx)\n\t"
118 " movq %%mm4, 8(%%ecx)\n\t"
119 " pfsub %%mm1, %%mm3\n\t"
120 " pfsub %%mm5, %%mm7\n\t"
121 " pfmul 64(%%ebx), %%mm3\n\t"
122 " pfmul 72(%%ebx), %%mm7\n\t"
123 " pswapd %%mm3, %%mm3\n\t"
124 " pswapd %%mm7, %%mm7\n\t"
125 " movq %%mm3, 56(%%ecx)\n\t"
126 " movq %%mm7, 48(%%ecx)\n\t"
128 " movq 16(%%edx), %%mm0\n\t"
129 " movq 24(%%edx), %%mm4\n\t"
130 " movq %%mm0, %%mm3\n\t"
131 " movq %%mm4, %%mm7\n\t"
132 " pswapd 40(%%edx), %%mm1\n\t"
133 " pswapd 32(%%edx), %%mm5\n\t"
134 " pfadd %%mm1, %%mm0\n\t"
135 " pfadd %%mm5, %%mm4\n\t"
136 " movq %%mm0, 16(%%ecx)\n\t"
137 " movq %%mm4, 24(%%ecx)\n\t"
138 " pfsub %%mm1, %%mm3\n\t"
139 " pfsub %%mm5, %%mm7\n\t"
140 " pfmul 80(%%ebx), %%mm3\n\t"
141 " pfmul 88(%%ebx), %%mm7\n\t"
142 " pswapd %%mm3, %%mm3\n\t"
143 " pswapd %%mm7, %%mm7\n\t"
144 " movq %%mm3, 40(%%ecx)\n\t"
145 " movq %%mm7, 32(%%ecx)\n\t"
147 /* Phase 3*/
149 " movq 64(%%edx), %%mm0\n\t"
150 " movq 72(%%edx), %%mm4\n\t"
151 " movq %%mm0, %%mm3\n\t"
152 " movq %%mm4, %%mm7\n\t"
153 " pswapd 120(%%edx), %%mm1\n\t"
154 " pswapd 112(%%edx), %%mm5\n\t"
155 " pfadd %%mm1, %%mm0\n\t"
156 " pfadd %%mm5, %%mm4\n\t"
157 " movq %%mm0, 64(%%ecx)\n\t"
158 " movq %%mm4, 72(%%ecx)\n\t"
159 " pfsubr %%mm1, %%mm3\n\t"
160 " pfsubr %%mm5, %%mm7\n\t"
161 " pfmul 64(%%ebx), %%mm3\n\t"
162 " pfmul 72(%%ebx), %%mm7\n\t"
163 " pswapd %%mm3, %%mm3\n\t"
164 " pswapd %%mm7, %%mm7\n\t"
165 " movq %%mm3, 120(%%ecx)\n\t"
166 " movq %%mm7, 112(%%ecx)\n\t"
168 " movq 80(%%edx), %%mm0\n\t"
169 " movq 88(%%edx), %%mm4\n\t"
170 " movq %%mm0, %%mm3\n\t"
171 " movq %%mm4, %%mm7\n\t"
172 " pswapd 104(%%edx), %%mm1\n\t"
173 " pswapd 96(%%edx), %%mm5\n\t"
174 " pfadd %%mm1, %%mm0\n\t"
175 " pfadd %%mm5, %%mm4\n\t"
176 " movq %%mm0, 80(%%ecx)\n\t"
177 " movq %%mm4, 88(%%ecx)\n\t"
178 " pfsubr %%mm1, %%mm3\n\t"
179 " pfsubr %%mm5, %%mm7\n\t"
180 " pfmul 80(%%ebx), %%mm3\n\t"
181 " pfmul 88(%%ebx), %%mm7\n\t"
182 " pswapd %%mm3, %%mm3\n\t"
183 " pswapd %%mm7, %%mm7\n\t"
184 " movq %%mm3, 104(%%ecx)\n\t"
185 " movq %%mm7, 96(%%ecx)\n\t"
187 /* Phase 4*/
189 " movq 96(%%ebx), %%mm2\n\t"
190 " movq 104(%%ebx), %%mm6\n\t"
192 " movq (%%ecx), %%mm0\n\t"
193 " movq 8(%%ecx), %%mm4\n\t"
194 " movq %%mm0, %%mm3\n\t"
195 " movq %%mm4, %%mm7\n\t"
196 " pswapd 24(%%ecx), %%mm1\n\t"
197 " pswapd 16(%%ecx), %%mm5\n\t"
198 " pfadd %%mm1, %%mm0\n\t"
199 " pfadd %%mm5, %%mm4\n\t"
200 " movq %%mm0, (%%edx)\n\t"
201 " movq %%mm4, 8(%%edx)\n\t"
202 " pfsub %%mm1, %%mm3\n\t"
203 " pfsub %%mm5, %%mm7\n\t"
204 " pfmul %%mm2, %%mm3\n\t"
205 " pfmul %%mm6, %%mm7\n\t"
206 " pswapd %%mm3, %%mm3\n\t"
207 " pswapd %%mm7, %%mm7\n\t"
208 " movq %%mm3, 24(%%edx)\n\t"
209 " movq %%mm7, 16(%%edx)\n\t"
211 " movq 32(%%ecx), %%mm0\n\t"
212 " movq 40(%%ecx), %%mm4\n\t"
213 " movq %%mm0, %%mm3\n\t"
214 " movq %%mm4, %%mm7\n\t"
215 " pswapd 56(%%ecx), %%mm1\n\t"
216 " pswapd 48(%%ecx), %%mm5\n\t"
217 " pfadd %%mm1, %%mm0\n\t"
218 " pfadd %%mm5, %%mm4\n\t"
219 " movq %%mm0, 32(%%edx)\n\t"
220 " movq %%mm4, 40(%%edx)\n\t"
221 " pfsubr %%mm1, %%mm3\n\t"
222 " pfsubr %%mm5, %%mm7\n\t"
223 " pfmul %%mm2, %%mm3\n\t"
224 " pfmul %%mm6, %%mm7\n\t"
225 " pswapd %%mm3, %%mm3\n\t"
226 " pswapd %%mm7, %%mm7\n\t"
227 " movq %%mm3, 56(%%edx)\n\t"
228 " movq %%mm7, 48(%%edx)\n\t"
230 " movq 64(%%ecx), %%mm0\n\t"
231 " movq 72(%%ecx), %%mm4\n\t"
232 " movq %%mm0, %%mm3\n\t"
233 " movq %%mm4, %%mm7\n\t"
234 " pswapd 88(%%ecx), %%mm1\n\t"
235 " pswapd 80(%%ecx), %%mm5\n\t"
236 " pfadd %%mm1, %%mm0\n\t"
237 " pfadd %%mm5, %%mm4\n\t"
238 " movq %%mm0, 64(%%edx)\n\t"
239 " movq %%mm4, 72(%%edx)\n\t"
240 " pfsub %%mm1, %%mm3\n\t"
241 " pfsub %%mm5, %%mm7\n\t"
242 " pfmul %%mm2, %%mm3\n\t"
243 " pfmul %%mm6, %%mm7\n\t"
244 " pswapd %%mm3, %%mm3\n\t"
245 " pswapd %%mm7, %%mm7\n\t"
246 " movq %%mm3, 88(%%edx)\n\t"
247 " movq %%mm7, 80(%%edx)\n\t"
249 " movq 96(%%ecx), %%mm0\n\t"
250 " movq 104(%%ecx), %%mm4\n\t"
251 " movq %%mm0, %%mm3\n\t"
252 " movq %%mm4, %%mm7\n\t"
253 " pswapd 120(%%ecx), %%mm1\n\t"
254 " pswapd 112(%%ecx), %%mm5\n\t"
255 " pfadd %%mm1, %%mm0\n\t"
256 " pfadd %%mm5, %%mm4\n\t"
257 " movq %%mm0, 96(%%edx)\n\t"
258 " movq %%mm4, 104(%%edx)\n\t"
259 " pfsubr %%mm1, %%mm3\n\t"
260 " pfsubr %%mm5, %%mm7\n\t"
261 " pfmul %%mm2, %%mm3\n\t"
262 " pfmul %%mm6, %%mm7\n\t"
263 " pswapd %%mm3, %%mm3\n\t"
264 " pswapd %%mm7, %%mm7\n\t"
265 " movq %%mm3, 120(%%edx)\n\t"
266 " movq %%mm7, 112(%%edx)\n\t"
268 /* Phase 5 */
270 " movq 112(%%ebx), %%mm2\n\t"
272 " movq (%%edx), %%mm0\n\t"
273 " movq 16(%%edx), %%mm4\n\t"
274 " movq %%mm0, %%mm3\n\t"
275 " movq %%mm4, %%mm7\n\t"
276 " pswapd 8(%%edx), %%mm1\n\t"
277 " pswapd 24(%%edx), %%mm5\n\t"
278 " pfadd %%mm1, %%mm0\n\t"
279 " pfadd %%mm5, %%mm4\n\t"
280 " movq %%mm0, (%%ecx)\n\t"
281 " movq %%mm4, 16(%%ecx)\n\t"
282 " pfsub %%mm1, %%mm3\n\t"
283 " pfsubr %%mm5, %%mm7\n\t"
284 " pfmul %%mm2, %%mm3\n\t"
285 " pfmul %%mm2, %%mm7\n\t"
286 " pswapd %%mm3, %%mm3\n\t"
287 " pswapd %%mm7, %%mm7\n\t"
288 " movq %%mm3, 8(%%ecx)\n\t"
289 " movq %%mm7, 24(%%ecx)\n\t"
291 " movq 32(%%edx), %%mm0\n\t"
292 " movq 48(%%edx), %%mm4\n\t"
293 " movq %%mm0, %%mm3\n\t"
294 " movq %%mm4, %%mm7\n\t"
295 " pswapd 40(%%edx), %%mm1\n\t"
296 " pswapd 56(%%edx), %%mm5\n\t"
297 " pfadd %%mm1, %%mm0\n\t"
298 " pfadd %%mm5, %%mm4\n\t"
299 " movq %%mm0, 32(%%ecx)\n\t"
300 " movq %%mm4, 48(%%ecx)\n\t"
301 " pfsub %%mm1, %%mm3\n\t"
302 " pfsubr %%mm5, %%mm7\n\t"
303 " pfmul %%mm2, %%mm3\n\t"
304 " pfmul %%mm2, %%mm7\n\t"
305 " pswapd %%mm3, %%mm3\n\t"
306 " pswapd %%mm7, %%mm7\n\t"
307 " movq %%mm3, 40(%%ecx)\n\t"
308 " movq %%mm7, 56(%%ecx)\n\t"
310 " movq 64(%%edx), %%mm0\n\t"
311 " movq 80(%%edx), %%mm4\n\t"
312 " movq %%mm0, %%mm3\n\t"
313 " movq %%mm4, %%mm7\n\t"
314 " pswapd 72(%%edx), %%mm1\n\t"
315 " pswapd 88(%%edx), %%mm5\n\t"
316 " pfadd %%mm1, %%mm0\n\t"
317 " pfadd %%mm5, %%mm4\n\t"
318 " movq %%mm0, 64(%%ecx)\n\t"
319 " movq %%mm4, 80(%%ecx)\n\t"
320 " pfsub %%mm1, %%mm3\n\t"
321 " pfsubr %%mm5, %%mm7\n\t"
322 " pfmul %%mm2, %%mm3\n\t"
323 " pfmul %%mm2, %%mm7\n\t"
324 " pswapd %%mm3, %%mm3\n\t"
325 " pswapd %%mm7, %%mm7\n\t"
326 " movq %%mm3, 72(%%ecx)\n\t"
327 " movq %%mm7, 88(%%ecx)\n\t"
329 " movq 96(%%edx), %%mm0\n\t"
330 " movq 112(%%edx), %%mm4\n\t"
331 " movq %%mm0, %%mm3\n\t"
332 " movq %%mm4, %%mm7\n\t"
333 " pswapd 104(%%edx), %%mm1\n\t"
334 " pswapd 120(%%edx), %%mm5\n\t"
335 " pfadd %%mm1, %%mm0\n\t"
336 " pfadd %%mm5, %%mm4\n\t"
337 " movq %%mm0, 96(%%ecx)\n\t"
338 " movq %%mm4, 112(%%ecx)\n\t"
339 " pfsub %%mm1, %%mm3\n\t"
340 " pfsubr %%mm5, %%mm7\n\t"
341 " pfmul %%mm2, %%mm3\n\t"
342 " pfmul %%mm2, %%mm7\n\t"
343 " pswapd %%mm3, %%mm3\n\t"
344 " pswapd %%mm7, %%mm7\n\t"
345 " movq %%mm3, 104(%%ecx)\n\t"
346 " movq %%mm7, 120(%%ecx)\n\t"
349 /* Phase 6. This is the end of easy road. */
350 /* Code below is coded in scalar mode. Should be optimized */
352 " movd "MANGLE(plus_1f)", %%mm6\n\t"
353 " punpckldq 120(%%ebx), %%mm6\n\t" /* mm6 = 1.0 | 120(%%ebx)*/
354 " movq "MANGLE(x_plus_minus_3dnow)", %%mm7\n\t" /* mm7 = +1 | -1 */
356 " movq 32(%%ecx), %%mm0\n\t"
357 " movq 64(%%ecx), %%mm2\n\t"
358 " movq %%mm0, %%mm1\n\t"
359 " movq %%mm2, %%mm3\n\t"
360 " pxor %%mm7, %%mm1\n\t"
361 " pxor %%mm7, %%mm3\n\t"
362 " pfacc %%mm1, %%mm0\n\t"
363 " pfacc %%mm3, %%mm2\n\t"
364 " pfmul %%mm6, %%mm0\n\t"
365 " pfmul %%mm6, %%mm2\n\t"
366 " movq %%mm0, 32(%%edx)\n\t"
367 " movq %%mm2, 64(%%edx)\n\t"
369 " movd 44(%%ecx), %%mm0\n\t"
370 " movd 40(%%ecx), %%mm2\n\t"
371 " movd 120(%%ebx), %%mm3\n\t"
372 " punpckldq 76(%%ecx), %%mm0\n\t"
373 " punpckldq 72(%%ecx), %%mm2\n\t"
374 " punpckldq %%mm3, %%mm3\n\t"
375 " movq %%mm0, %%mm4\n\t"
376 " movq %%mm2, %%mm5\n\t"
377 " pfsub %%mm2, %%mm0\n\t"
378 " pfmul %%mm3, %%mm0\n\t"
379 " movq %%mm0, %%mm1\n\t"
380 " pfadd %%mm5, %%mm0\n\t"
381 " pfadd %%mm4, %%mm0\n\t"
382 " movq %%mm0, %%mm2\n\t"
383 " punpckldq %%mm1, %%mm0\n\t"
384 " punpckhdq %%mm1, %%mm2\n\t"
385 " movq %%mm0, 40(%%edx)\n\t"
386 " movq %%mm2, 72(%%edx)\n\t"
388 " movd 48(%%ecx), %%mm3\n\t"
389 " movd 60(%%ecx), %%mm2\n\t"
390 " pfsub 52(%%ecx), %%mm3\n\t"
391 " pfsub 56(%%ecx), %%mm2\n\t"
392 " pfmul 120(%%ebx), %%mm3\n\t"
393 " pfmul 120(%%ebx), %%mm2\n\t"
394 " movq %%mm2, %%mm1\n\t"
396 " pfadd 56(%%ecx), %%mm1\n\t"
397 " pfadd 60(%%ecx), %%mm1\n\t"
398 " movq %%mm1, %%mm0\n\t"
400 " pfadd 48(%%ecx), %%mm0\n\t"
401 " pfadd 52(%%ecx), %%mm0\n\t"
402 " pfadd %%mm3, %%mm1\n\t"
403 " punpckldq %%mm2, %%mm1\n\t"
404 " pfadd %%mm3, %%mm2\n\t"
405 " punpckldq %%mm2, %%mm0\n\t"
406 " movq %%mm1, 56(%%edx)\n\t"
407 " movq %%mm0, 48(%%edx)\n\t"
409 /*---*/
411 " movd 92(%%ecx), %%mm1\n\t"
412 " pfsub 88(%%ecx), %%mm1\n\t"
413 " pfmul 120(%%ebx), %%mm1\n\t"
414 " movd %%mm1, 92(%%edx)\n\t"
415 " pfadd 92(%%ecx), %%mm1\n\t"
416 " pfadd 88(%%ecx), %%mm1\n\t"
417 " movq %%mm1, %%mm0\n\t"
419 " pfadd 80(%%ecx), %%mm0\n\t"
420 " pfadd 84(%%ecx), %%mm0\n\t"
421 " movd %%mm0, 80(%%edx)\n\t"
423 " movd 80(%%ecx), %%mm0\n\t"
424 " pfsub 84(%%ecx), %%mm0\n\t"
425 " pfmul 120(%%ebx), %%mm0\n\t"
426 " pfadd %%mm0, %%mm1\n\t"
427 " pfadd 92(%%edx), %%mm0\n\t"
428 " punpckldq %%mm1, %%mm0\n\t"
429 " movq %%mm0, 84(%%edx)\n\t"
431 " movq 96(%%ecx), %%mm0\n\t"
432 " movq %%mm0, %%mm1\n\t"
433 " pxor %%mm7, %%mm1\n\t"
434 " pfacc %%mm1, %%mm0\n\t"
435 " pfmul %%mm6, %%mm0\n\t"
436 " movq %%mm0, 96(%%edx)\n\t"
438 " movd 108(%%ecx), %%mm0\n\t"
439 " pfsub 104(%%ecx), %%mm0\n\t"
440 " pfmul 120(%%ebx), %%mm0\n\t"
441 " movd %%mm0, 108(%%edx)\n\t"
442 " pfadd 104(%%ecx), %%mm0\n\t"
443 " pfadd 108(%%ecx), %%mm0\n\t"
444 " movd %%mm0, 104(%%edx)\n\t"
446 " movd 124(%%ecx), %%mm1\n\t"
447 " pfsub 120(%%ecx), %%mm1\n\t"
448 " pfmul 120(%%ebx), %%mm1\n\t"
449 " movd %%mm1, 124(%%edx)\n\t"
450 " pfadd 120(%%ecx), %%mm1\n\t"
451 " pfadd 124(%%ecx), %%mm1\n\t"
452 " movq %%mm1, %%mm0\n\t"
454 " pfadd 112(%%ecx), %%mm0\n\t"
455 " pfadd 116(%%ecx), %%mm0\n\t"
456 " movd %%mm0, 112(%%edx)\n\t"
458 " movd 112(%%ecx), %%mm0\n\t"
459 " pfsub 116(%%ecx), %%mm0\n\t"
460 " pfmul 120(%%ebx), %%mm0\n\t"
461 " pfadd %%mm0,%%mm1\n\t"
462 " pfadd 124(%%edx), %%mm0\n\t"
463 " punpckldq %%mm1, %%mm0\n\t"
464 " movq %%mm0, 116(%%edx)\n\t"
466 // The disabled code below is broken: nothing above modifies the Z flag, so its jnz has no meaningful condition to test.
467 #if 0
468 " jnz .L01\n\t"
470 /* Phase 7*/
471 /* Code below is coded in scalar mode. Should be optimized */
473 " movd (%%ecx), %%mm0\n\t"
474 " pfadd 4(%%ecx), %%mm0\n\t"
475 " movd %%mm0, 1024(%%esi)\n\t"
477 " movd (%%ecx), %%mm0\n\t"
478 " pfsub 4(%%ecx), %%mm0\n\t"
479 " pfmul 120(%%ebx), %%mm0\n\t"
480 " movd %%mm0, (%%esi)\n\t"
481 " movd %%mm0, (%%edi)\n\t"
483 " movd 12(%%ecx), %%mm0\n\t"
484 " pfsub 8(%%ecx), %%mm0\n\t"
485 " pfmul 120(%%ebx), %%mm0\n\t"
486 " movd %%mm0, 512(%%edi)\n\t"
487 " pfadd 12(%%ecx), %%mm0\n\t"
488 " pfadd 8(%%ecx), %%mm0\n\t"
489 " movd %%mm0, 512(%%esi)\n\t"
491 " movd 16(%%ecx), %%mm0\n\t"
492 " pfsub 20(%%ecx), %%mm0\n\t"
493 " pfmul 120(%%ebx), %%mm0\n\t"
494 " movq %%mm0, %%mm3\n\t"
496 " movd 28(%%ecx), %%mm0\n\t"
497 " pfsub 24(%%ecx), %%mm0\n\t"
498 " pfmul 120(%%ebx), %%mm0\n\t"
499 " movd %%mm0, 768(%%edi)\n\t"
500 " movq %%mm0, %%mm2\n\t"
502 " pfadd 24(%%ecx), %%mm0\n\t"
503 " pfadd 28(%%ecx), %%mm0\n\t"
504 " movq %%mm0, %%mm1\n\t"
506 " pfadd 16(%%ecx), %%mm0\n\t"
507 " pfadd 20(%%ecx), %%mm0\n\t"
508 " movd %%mm0, 768(%%esi)\n\t"
509 " pfadd %%mm3, %%mm1\n\t"
510 " movd %%mm1, 256(%%esi)\n\t"
511 " pfadd %%mm3, %%mm2\n\t"
512 " movd %%mm2, 256(%%edi)\n\t"
514 /* Phase 8*/
516 " movq 32(%%edx), %%mm0\n\t"
517 " movq 48(%%edx), %%mm1\n\t"
518 " pfadd 48(%%edx), %%mm0\n\t"
519 " pfadd 40(%%edx), %%mm1\n\t"
520 " movd %%mm0, 896(%%esi)\n\t"
521 " movd %%mm1, 640(%%esi)\n\t"
522 " psrlq $32, %%mm0\n\t"
523 " psrlq $32, %%mm1\n\t"
524 " movd %%mm0, 128(%%edi)\n\t"
525 " movd %%mm1, 384(%%edi)\n\t"
527 " movd 40(%%edx), %%mm0\n\t"
528 " pfadd 56(%%edx), %%mm0\n\t"
529 " movd %%mm0, 384(%%esi)\n\t"
531 " movd 56(%%edx), %%mm0\n\t"
532 " pfadd 36(%%edx), %%mm0\n\t"
533 " movd %%mm0, 128(%%esi)\n\t"
535 " movd 60(%%edx), %%mm0\n\t"
536 " movd %%mm0, 896(%%edi)\n\t"
537 " pfadd 44(%%edx), %%mm0\n\t"
538 " movd %%mm0, 640(%%edi)\n\t"
540 " movq 96(%%edx), %%mm0\n\t"
541 " movq 112(%%edx), %%mm2\n\t"
542 " movq 104(%%edx), %%mm4\n\t"
543 " pfadd 112(%%edx), %%mm0\n\t"
544 " pfadd 104(%%edx), %%mm2\n\t"
545 " pfadd 120(%%edx), %%mm4\n\t"
546 " movq %%mm0, %%mm1\n\t"
547 " movq %%mm2, %%mm3\n\t"
548 " movq %%mm4, %%mm5\n\t"
549 " pfadd 64(%%edx), %%mm0\n\t"
550 " pfadd 80(%%edx), %%mm2\n\t"
551 " pfadd 72(%%edx), %%mm4\n\t"
552 " movd %%mm0, 960(%%esi)\n\t"
553 " movd %%mm2, 704(%%esi)\n\t"
554 " movd %%mm4, 448(%%esi)\n\t"
555 " psrlq $32, %%mm0\n\t"
556 " psrlq $32, %%mm2\n\t"
557 " psrlq $32, %%mm4\n\t"
558 " movd %%mm0, 64(%%edi)\n\t"
559 " movd %%mm2, 320(%%edi)\n\t"
560 " movd %%mm4, 576(%%edi)\n\t"
561 " pfadd 80(%%edx), %%mm1\n\t"
562 " pfadd 72(%%edx), %%mm3\n\t"
563 " pfadd 88(%%edx), %%mm5\n\t"
564 " movd %%mm1, 832(%%esi)\n\t"
565 " movd %%mm3, 576(%%esi)\n\t"
566 " movd %%mm5, 320(%%esi)\n\t"
567 " psrlq $32, %%mm1\n\t"
568 " psrlq $32, %%mm3\n\t"
569 " psrlq $32, %%mm5\n\t"
570 " movd %%mm1, 192(%%edi)\n\t"
571 " movd %%mm3, 448(%%edi)\n\t"
572 " movd %%mm5, 704(%%edi)\n\t"
574 " movd 120(%%edx), %%mm0\n\t"
575 " pfadd 100(%%edx), %%mm0\n\t"
576 " movq %%mm0, %%mm1\n\t"
577 " pfadd 88(%%edx), %%mm0\n\t"
578 " movd %%mm0, 192(%%esi)\n\t"
579 " pfadd 68(%%edx), %%mm1\n\t"
580 " movd %%mm1, 64(%%esi)\n\t"
582 " movd 124(%%edx), %%mm0\n\t"
583 " movd %%mm0, 960(%%edi)\n\t"
584 " pfadd 92(%%edx), %%mm0\n\t"
585 " movd %%mm0, 832(%%edi)\n\t"
587 " jmp .L_bye\n\t"
588 ".L01: \n\t"
589 #endif
590 /* Phase 9*/
592 " movq (%%ecx), %%mm0\n\t"
593 " movq %%mm0, %%mm1\n\t"
594 " pxor %%mm7, %%mm1\n\t"
595 " pfacc %%mm1, %%mm0\n\t"
596 " pfmul %%mm6, %%mm0\n\t"
597 " pf2iw %%mm0, %%mm0\n\t"
598 " movd %%mm0, %%eax\n\t"
599 " movw %%ax, 512(%%esi)\n\t"
600 " psrlq $32, %%mm0\n\t"
601 " movd %%mm0, %%eax\n\t"
602 " movw %%ax, (%%esi)\n\t"
604 " movd 12(%%ecx), %%mm0\n\t"
605 " pfsub 8(%%ecx), %%mm0\n\t"
606 " pfmul 120(%%ebx), %%mm0\n\t"
607 " pf2iw %%mm0, %%mm7\n\t"
608 " movd %%mm7, %%eax\n\t"
609 " movw %%ax, 256(%%edi)\n\t"
610 " pfadd 12(%%ecx), %%mm0\n\t"
611 " pfadd 8(%%ecx), %%mm0\n\t"
612 " pf2iw %%mm0, %%mm0\n\t"
613 " movd %%mm0, %%eax\n\t"
614 " movw %%ax, 256(%%esi)\n\t"
616 " movd 16(%%ecx), %%mm3\n\t"
617 " pfsub 20(%%ecx), %%mm3\n\t"
618 " pfmul 120(%%ebx), %%mm3\n\t"
619 " movq %%mm3, %%mm2\n\t"
621 " movd 28(%%ecx), %%mm2\n\t"
622 " pfsub 24(%%ecx), %%mm2\n\t"
623 " pfmul 120(%%ebx), %%mm2\n\t"
624 " movq %%mm2, %%mm1\n\t"
626 " pf2iw %%mm2, %%mm7\n\t"
627 " movd %%mm7, %%eax\n\t"
628 " movw %%ax, 384(%%edi)\n\t"
630 " pfadd 24(%%ecx), %%mm1\n\t"
631 " pfadd 28(%%ecx), %%mm1\n\t"
632 " movq %%mm1, %%mm0\n\t"
634 " pfadd 16(%%ecx), %%mm0\n\t"
635 " pfadd 20(%%ecx), %%mm0\n\t"
636 " pf2iw %%mm0, %%mm0\n\t"
637 " movd %%mm0, %%eax\n\t"
638 " movw %%ax, 384(%%esi)\n\t"
639 " pfadd %%mm3, %%mm1\n\t"
640 " pf2iw %%mm1, %%mm1\n\t"
641 " movd %%mm1, %%eax\n\t"
642 " movw %%ax, 128(%%esi)\n\t"
643 " pfadd %%mm3, %%mm2\n\t"
644 " pf2iw %%mm2, %%mm2\n\t"
645 " movd %%mm2, %%eax\n\t"
646 " movw %%ax, 128(%%edi)\n\t"
648 /* Phase 10*/
650 " movq 32(%%edx), %%mm0\n\t"
651 " movq 48(%%edx), %%mm1\n\t"
652 " pfadd 48(%%edx), %%mm0\n\t"
653 " pfadd 40(%%edx), %%mm1\n\t"
654 " pf2iw %%mm0, %%mm0\n\t"
655 " pf2iw %%mm1, %%mm1\n\t"
656 " movd %%mm0, %%eax\n\t"
657 " movd %%mm1, %%ecx\n\t"
658 " movw %%ax, 448(%%esi)\n\t"
659 " movw %%cx, 320(%%esi)\n\t"
660 " psrlq $32, %%mm0\n\t"
661 " psrlq $32, %%mm1\n\t"
662 " movd %%mm0, %%eax\n\t"
663 " movd %%mm1, %%ecx\n\t"
664 " movw %%ax, 64(%%edi)\n\t"
665 " movw %%cx, 192(%%edi)\n\t"
667 " movd 40(%%edx), %%mm3\n\t"
668 " movd 56(%%edx), %%mm4\n\t"
669 " movd 60(%%edx), %%mm0\n\t"
670 " movd 44(%%edx), %%mm2\n\t"
671 " movd 120(%%edx), %%mm5\n\t"
672 " punpckldq %%mm4, %%mm3\n\t"
673 " punpckldq 124(%%edx), %%mm0\n\t"
674 " pfadd 100(%%edx), %%mm5\n\t"
675 " punpckldq 36(%%edx), %%mm4\n\t"
676 " punpckldq 92(%%edx), %%mm2\n\t"
677 " movq %%mm5, %%mm6\n\t"
678 " pfadd %%mm4, %%mm3\n\t"
679 " pf2iw %%mm0, %%mm1\n\t"
680 " pf2iw %%mm3, %%mm3\n\t"
681 " pfadd 88(%%edx), %%mm5\n\t"
682 " movd %%mm1, %%eax\n\t"
683 " movd %%mm3, %%ecx\n\t"
684 " movw %%ax, 448(%%edi)\n\t"
685 " movw %%cx, 192(%%esi)\n\t"
686 " pf2iw %%mm5, %%mm5\n\t"
687 " psrlq $32, %%mm1\n\t"
688 " psrlq $32, %%mm3\n\t"
689 " movd %%mm5, %%ebx\n\t"
690 " movd %%mm1, %%eax\n\t"
691 " movd %%mm3, %%ecx\n\t"
692 " movw %%bx, 96(%%esi)\n\t"
693 " movw %%ax, 480(%%edi)\n\t"
694 " movw %%cx, 64(%%esi)\n\t"
695 " pfadd %%mm2, %%mm0\n\t"
696 " pf2iw %%mm0, %%mm0\n\t"
697 " movd %%mm0, %%eax\n\t"
698 " pfadd 68(%%edx), %%mm6\n\t"
699 " movw %%ax, 320(%%edi)\n\t"
700 " psrlq $32, %%mm0\n\t"
701 " pf2iw %%mm6, %%mm6\n\t"
702 " movd %%mm0, %%eax\n\t"
703 " movd %%mm6, %%ebx\n\t"
704 " movw %%ax, 416(%%edi)\n\t"
705 " movw %%bx, 32(%%esi)\n\t"
707 " movq 96(%%edx), %%mm0\n\t"
708 " movq 112(%%edx), %%mm2\n\t"
709 " movq 104(%%edx), %%mm4\n\t"
710 " pfadd %%mm2, %%mm0\n\t"
711 " pfadd %%mm4, %%mm2\n\t"
712 " pfadd 120(%%edx), %%mm4\n\t"
713 " movq %%mm0, %%mm1\n\t"
714 " movq %%mm2, %%mm3\n\t"
715 " movq %%mm4, %%mm5\n\t"
716 " pfadd 64(%%edx), %%mm0\n\t"
717 " pfadd 80(%%edx), %%mm2\n\t"
718 " pfadd 72(%%edx), %%mm4\n\t"
719 " pf2iw %%mm0, %%mm0\n\t"
720 " pf2iw %%mm2, %%mm2\n\t"
721 " pf2iw %%mm4, %%mm4\n\t"
722 " movd %%mm0, %%eax\n\t"
723 " movd %%mm2, %%ecx\n\t"
724 " movd %%mm4, %%ebx\n\t"
725 " movw %%ax, 480(%%esi)\n\t"
726 " movw %%cx, 352(%%esi)\n\t"
727 " movw %%bx, 224(%%esi)\n\t"
728 " psrlq $32, %%mm0\n\t"
729 " psrlq $32, %%mm2\n\t"
730 " psrlq $32, %%mm4\n\t"
731 " movd %%mm0, %%eax\n\t"
732 " movd %%mm2, %%ecx\n\t"
733 " movd %%mm4, %%ebx\n\t"
734 " movw %%ax, 32(%%edi)\n\t"
735 " movw %%cx, 160(%%edi)\n\t"
736 " movw %%bx, 288(%%edi)\n\t"
737 " pfadd 80(%%edx), %%mm1\n\t"
738 " pfadd 72(%%edx), %%mm3\n\t"
739 " pfadd 88(%%edx), %%mm5\n\t"
740 " pf2iw %%mm1, %%mm1\n\t"
741 " pf2iw %%mm3, %%mm3\n\t"
742 " pf2iw %%mm5, %%mm5\n\t"
743 " movd %%mm1, %%eax\n\t"
744 " movd %%mm3, %%ecx\n\t"
745 " movd %%mm5, %%ebx\n\t"
746 " movw %%ax, 416(%%esi)\n\t"
747 " movw %%cx, 288(%%esi)\n\t"
748 " movw %%bx, 160(%%esi)\n\t"
749 " psrlq $32, %%mm1\n\t"
750 " psrlq $32, %%mm3\n\t"
751 " psrlq $32, %%mm5\n\t"
752 " movd %%mm1, %%eax\n\t"
753 " movd %%mm3, %%ecx\n\t"
754 " movd %%mm5, %%ebx\n\t"
755 " movw %%ax, 96(%%edi)\n\t"
756 " movw %%cx, 224(%%edi)\n\t"
757 " movw %%bx, 352(%%edi)\n\t"
759 " movsw\n\t"
761 ".L_bye:\n\t"
762 " femms\n\t"
764 :"m"(a),"m"(b),"m"(c),"m"(tmp[0])
765 :"memory","%eax","%ebx","%ecx","%edx","%esi","%edi");